diff --git a/cmake/compiler_options.cmake b/cmake/compiler_options.cmake
index c7c55f402..ef4a0dbcf 100644
--- a/cmake/compiler_options.cmake
+++ b/cmake/compiler_options.cmake
@@ -39,6 +39,9 @@ anakin_add_compile_option(-Wshadow)
 anakin_add_compile_option(-fpermissive)
 anakin_add_compile_option(-Wsign-promo)
 anakin_add_compile_option(-fdiagnostics-show-option)
+if(USE_BM_PLACE)
+    anakin_add_compile_option(-lbmlib-asic)
+endif()
 
 if(ENABLE_NOISY_WARNINGS)
 	anakin_add_compile_option(-Wcast-align)
diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake
index 6a94f32b0..a06ea879b 100644
--- a/cmake/find_modules.cmake
+++ b/cmake/find_modules.cmake
@@ -357,24 +357,22 @@ macro(anakin_find_openmp)
 endmacro()
 
 macro(anakin_find_bmlib)
-    find_path(BM_ROOT include/bmdnn/bmdnn_api.h ${CMAKE_SOURCE_DIR}/third-party/bm_lib/ $ENV{BM_ROOT}/) 
-    find_path(BM_ROOT_INCLUDE_DNN bmdnn_api.h ${BM_ROOT}/include/bmdnn) 
-    find_path(BM_ROOT_INCLUDE_RT bmruntime.h ${BM_ROOT}/include/bmruntime) 
-    find_path(BM_ROOT_INCLUDE_LIB bmlib_runtime.h ${BM_ROOT}/include/bmlib) 
-    if(BM_ROOT_INCLUDE_DNN AND BM_ROOT_INCLUDE_RT AND BM_ROOT_INCLUDE_LIB) 
-        set(BM_FOUND TRUE) 
-    endif() 
-    if(BM_FOUND) 
-        message(STATUS " Found bm_lib in ${BM_ROOT}  ${BM_ROOT_INCLUDE_DNN} ${BM_ROOT_INCLUDE_RT} ${BM_ROOT_INCLUDE_LIB}")
-        include_directories(${BM_ROOT_INCLUDE_DNN})
-        include_directories(${BM_ROOT_INCLUDE_RT}) 
-        include_directories(${BM_ROOT_INCLUDE_LIB}) 
-        set(BM_LIBRARIES "") 
-        list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmdnn_device.so) 
-        list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmlib_device.so)
-        list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmrt.so) 
-        list(APPEND ANAKIN_LINKER_LIBS ${BM_LIBRARIES}) 
-    else() 
-        message(FATAL_ERROR "Could not found bm_lib") 
-    endif()
+	find_path(BM_ROOT include/bmlib/bmlib_runtime.h /usr/local/include/bm/ $ENV{BM_ROOT}/)
+	if(BM_ROOT)
+		set(BM_FOUND TRUE)
+	endif()
+	if(BM_FOUND)
+		message(STATUS " Found bm_lib in ${BM_ROOT}")
+		anakin_fetch_include_recursively(${BM_ROOT}/include)
+		set(BM_LIBRARIES "")
+		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/bmlib.a)
+		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/common-arm.a)
+		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/common.a)
+		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/fw-arm.a)
+		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/fw-top.a)
+		list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmlib-asic.so)
+		list(APPEND ANAKIN_LINKER_LIBS ${BM_LIBRARIES})
+	else()
+		message(FATAL_ERROR "Could not found bm_lib")
+	endif()
 endmacro()
diff --git a/framework/core/net/worker.cpp b/framework/core/net/worker.cpp
index 0e8e436da..9d2b70b72 100644
--- a/framework/core/net/worker.cpp
+++ b/framework/core/net/worker.cpp
@@ -106,7 +106,8 @@ Worker<Ttype, Ptype, RunType>::sync_prediction(std::vector<Tensor4d<typename tar
             d_tensor_in_p->reshape(ins[i].valid_shape());
             d_tensor_in_p->copy_from(ins[i]);
             d_tensor_in_p->set_seq_offset(ins[i].get_seq_offset());
-        } 
+        }
+#ifdef NVIDIA_GPU 
         Context<NV> ctx(0, 0, 0); 
         saber::SaberTimer<NV> my_time; 
         my_time.start(ctx);
@@ -114,7 +115,7 @@ Worker<Ttype, Ptype, RunType>::sync_prediction(std::vector<Tensor4d<typename tar
         Context<NV> ctx(0, 0, 0); 
         saber::SaberTimer<NV> my_time;
         my_time.start(ctx);
-#endif
+#endif // ENABLE_OP_TIMER
         net.prediction(); 
 
         my_time.end(ctx); 
@@ -127,7 +128,8 @@ Worker<Ttype, Ptype, RunType>::sync_prediction(std::vector<Tensor4d<typename tar
             _thead_id_to_prediction_times_vec_in_ms[std::this_thread::get_id()].push_back(my_time.get_average_ms());
             LOG(ERROR) << " exec  << time: " << my_time.get_average_ms() << " ms ";
         }
-#endif
+#endif // ENABLE_OP_TIMER
+#endif // NVIDIA_GPU
         // get outputs of graph
         std::vector<Tensor4d<typename target_host<Ttype>::type>> ret;
         ret.resize(_outputs_in_order.size());
@@ -226,6 +228,16 @@ template class Worker<NV, Precision::FP16, OpRunType::SYNC>;
 template class Worker<NV, Precision::INT8, OpRunType::SYNC>;
 #endif
 
+#ifdef AMD_GPU
+template class Worker<AMD, Precision::FP32, OpRunType::ASYNC>;
+template class Worker<AMD, Precision::FP16, OpRunType::ASYNC>;
+template class Worker<AMD, Precision::INT8, OpRunType::ASYNC>;
+
+template class Worker<AMD, Precision::FP32, OpRunType::SYNC>;
+template class Worker<AMD, Precision::FP16, OpRunType::SYNC>;
+template class Worker<AMD, Precision::INT8, OpRunType::SYNC>;
+#endif
+
 #ifdef USE_X86_PLACE
 template class Worker<X86, Precision::FP32, OpRunType::ASYNC>;
 template class Worker<X86, Precision::FP16, OpRunType::ASYNC>;
@@ -256,4 +268,3 @@ template class Worker<ARM, Precision::INT8, OpRunType::SYNC>;
 #endif
 
 } /* namespace */
-
diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt
index c13f3df61..c31f7f941 100644
--- a/saber/CMakeLists.txt
+++ b/saber/CMakeLists.txt
@@ -92,6 +92,31 @@ if(USE_CUDA)
 				      ${WHOLE_ARCHIVE_END})	
 endif()
 
+if(USE_BM_PLACE)
+    set(BIN_NAME bmkernel_bin)
+    set(LINK_CONFIG link/bm1682_ddr.lds)
+    add_custom_command(OUTPUT bm_kernel_tmp
+        COMMAND arm-none-eabi-gcc ${ANAKIN_SABER}/funcs/impl/bm/device/bmkernel_base.c -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -std=gnu99 -O2 -Wall -Werror -ffunction-sections -fdata-sections -nostdlib -DENABLE_PRINT -I${BM_ROOT}/include/config -I${BM_ROOT}/include/common -I${BM_ROOT}/include/c_model -I${BM_ROOT}/include/firmware_core -I${BM_ROOT}/include/bmlib -c -o ${BIN_NAME}.o
+        COMMAND arm-none-eabi-gcc -T ${BM_ROOT}/${LINK_CONFIG} -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -Wl,--check-sections -Wl,--gc-sections -Wl,--unresolved-symbols=report-all -Wl,--no-enum-size-warning -o ${BIN_NAME}.elf -Wl,--start-group -lc -lm ${BIN_NAME}.o ${BM_ROOT}/lib/device/fw-top.a ${BM_ROOT}/lib/device/fw-arm.a -Wl,--end-group
+        COMMAND arm-none-eabi-objcopy -O binary -R *.slow* ${BIN_NAME}.elf ${BIN_NAME}_itcm.bin
+        COMMAND hexdump -v -e '1/4 \"%08x\\n\"' ${BIN_NAME}_itcm.bin > ${BIN_NAME}_itcm.hex.sim
+        COMMAND arm-none-eabi-objcopy -O binary -j *.slow* ${BIN_NAME}.elf ${BIN_NAME}_ddr.bin
+        COMMAND hexdump -v -e '1/4 \"%08x\\n\"' ${BIN_NAME}_ddr.bin > ${BIN_NAME}_ddr.hex.sim
+        COMMAND printf "%x" 0xAABBCCDD > ${BIN_NAME}.bin
+        COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
+        COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
+        COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin
+
+        COMMAND printf \"%x\" `wc -c < ${BIN_NAME}_itcm.hex.sim` >> ${BIN_NAME}.bin
+
+        COMMAND cat ${BIN_NAME}_itcm.hex.sim >> ${BIN_NAME}.bin
+        COMMAND cat ${BIN_NAME}_ddr.hex.sim >> ${BIN_NAME}.bin
+        COMMAND cp ${ANAKIN_ROOT}/build/saber/bmkernel_bin.bin /var/tmp/${BIN_NAME}.bin
+        COMMENT "BM Kernel compilation..."
+    )
+    add_custom_target(ANAKIN ALL DEPENDS bm_kernel_tmp)
+endif()
+
 # add saber library to static
 if(UNIX OR APPLE)
     if (USE_ARM_PLACE)
diff --git a/saber/core/common.h b/saber/core/common.h
index 73316bcf0..3a64b0d5d 100644
--- a/saber/core/common.h
+++ b/saber/core/common.h
@@ -179,11 +179,9 @@ const char* cudnn_get_errorstring(cudnnStatus_t status);
 #ifdef USE_BM_PLACE 
 
 #include "bmlib_runtime.h"
-#include "bmdnn_api.h"
-#include "bmdnn_ext_api.h"
 #include "bmlib_utils.h"
 
-#define BMDNN_CHECK(condition) \
+#define BM_CHECK(condition) \
   do { \
     bm_status_t error = condition; \
     CHECK_EQ(error, BM_SUCCESS) << " Failed with error code:" << error; \
diff --git a/saber/core/context.h b/saber/core/context.h
index 0cc032f2f..fc21bc755 100644
--- a/saber/core/context.h
+++ b/saber/core/context.h
@@ -35,7 +35,7 @@ class Context final{
      * @param compute_stream_id
      */
     Context(int device_id = 0, int data_stream_id = 0, int compute_stream_id = 0){
-#ifdef USE_BM        
+#ifdef USE_BM_PLACE        
         if(std::is_same<TargetType, BM>::value){
             LOG(INFO) << "context init for BM";
             int dev_count = 0;
@@ -69,7 +69,7 @@ class Context final{
     }
 
     Context(const Context<TargetType>& ctx){
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
         if(std::is_same<TargetType, BM>::value){
             LOG(INFO) << "context init for BM";
             _bm_handle = ctx._bm_handle;
@@ -98,7 +98,7 @@ class Context final{
         this->_act_ids = ctx._act_ids;
         this->_mode = ctx._mode;
 #endif
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
         this->_bm_handle = ctx._bm_handle;
 #endif
         return *this;
@@ -109,7 +109,7 @@ class Context final{
         comp_eq = comp_eq && (_device_id == right._device_id);
         comp_eq = comp_eq && (_data_stream_id == right._data_stream_id);
         comp_eq = comp_eq && (_compute_stream_id == right._compute_stream_id);
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
         comp_eq = comp_eq && (_bm_handle == right._bm_handle);
 #endif
         return comp_eq;
@@ -151,7 +151,7 @@ class Context final{
     //std::vector<int> get_act_ids();
 #endif
 
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
     bm_handle_t get_handle() {
         return _bm_handle;
     }
@@ -170,7 +170,7 @@ class Context final{
     PowerMode _mode{SABER_POWER_HIGH};
     std::vector<int> _act_ids{0};
 #endif
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
     bm_handle_t _bm_handle;
 #endif
 };
diff --git a/saber/core/data_traits.h b/saber/core/data_traits.h
index c552ce258..342331caf 100644
--- a/saber/core/data_traits.h
+++ b/saber/core/data_traits.h
@@ -20,7 +20,6 @@
 
 #ifdef USE_BM_PLACE 
 #include "bmlib_runtime.h"
-#include "bmdnn_api.h"
 #include "bmlib_utils.h"
 #endif
 
diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp
index 1505acbf9..dd1a194c8 100644
--- a/saber/core/impl/bm/bm_impl.cpp
+++ b/saber/core/impl/bm/bm_impl.cpp
@@ -45,19 +45,19 @@ typedef TargetWrapper<BM, __device_target> BM_API;
 
 // Init handle only once in the lifetime
 static bm_handle_t handle;
-static bm_status_t init_handle{bmdnn_init(&handle)};
+static bm_status_t init_handle{bmlib_kernel_init(&handle)};
 
 bm_handle_t BM_API::get_handle() {
     return handle;
 };
 
 void BM_API::get_device_count(int& count) {
-    BMDNN_CHECK(bm_dev_getcount(&count));
+    BM_CHECK(bm_dev_getcount(&count));
 }
 
 void BM_API::set_device(int id) {
     //(bm_handle_t &handle, bool bmkernel_used, int id){
-    //BMDNN_CHECK(bm_dev_request(&handle, 0, id));
+    //BM_CHECK(bm_dev_request(&handle, 0, id));
 }
 
 //TODO: Do we have this functionality?
@@ -69,12 +69,12 @@ void BM_API::mem_alloc(TPtr* ptr, size_t n) {
     /* bm_device_mem_t *mem = reinterpret_cast<struct bm_mem_desc *>(*ptr); */
     //    bm_device_mem_t *mem = new bm_device_mem_t();
     bm_device_mem_t mem;
-    BMDNN_CHECK(bm_malloc_device_byte(handle, &mem, n));
+    BM_CHECK(bm_malloc_device_byte(handle, &mem, n));
     *ptr = TPtr(mem);
 }
 
 void BM_API::mem_free(TPtr ptr) {
-    if ((ptr != BM_MEM_NULL)) {
+    if (bm_mem_get_type(ptr) == BM_MEM_TYPE_SYSTEM) {
         bm_free_device(handle, ptr);
         //        delete ptr;
     }
@@ -82,9 +82,9 @@ void BM_API::mem_free(TPtr ptr) {
 
 void BM_API::mem_set(TPtr ptr, int value, size_t n) {
     //(bm_handle_t handle, const int value, bm_device_mem_t mem){
-    BMDNN_CHECK(bm_memset_device(handle, value, ptr));
+    BM_CHECK(bm_memset_device(handle, value, ptr));
     //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr);
-    //BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
+    //BM_CHECK(bm_memset_device(handle, value, *pmem));
 }
 
 void BM_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \
@@ -92,8 +92,8 @@ void BM_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \
         size_t count, __DtoD) {
     if(count==0)
         return;
-    //BMDNN_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
-    BMDNN_CHECK(bm_memcpy_d2d(handle, dst, dst_offset, src, src_offset, count));
+    //BM_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
+    BM_CHECK(bm_memcpy_d2d(handle, dst, dst_offset, src, src_offset, count));
 };
 
 void BM_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \
@@ -101,7 +101,7 @@ void BM_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \
         size_t count, __HtoD) {
     if(count==0)
         return;
-    BMDNN_CHECK(bm_memcpy_s2d(handle, dst+dst_offset, bm_mem_from_system(const_cast<void*>(src)+src_offset)));
+    BM_CHECK(bm_memcpy_s2d(handle, dst+dst_offset, bm_mem_from_system(const_cast<void*>(src)+src_offset)));
 
 #ifdef DEBUG
 
@@ -118,7 +118,7 @@ void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \
     if(count==0)
         return;
 //    LOG(INFO)<<"host ptr = "<<(dst)<<",dst_offset = "<<dst_offset<<", dev ptr = "<<(src)<<",dev offset = "<<src_offset;
-    BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst+dst_offset), src+src_offset));
+    BM_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst+dst_offset), src+src_offset));
 
 #ifdef DEBUG
 
diff --git a/saber/core/impl/bm/tensor_op_bm.cpp b/saber/core/impl/bm/tensor_op_bm.cpp
index 8380d145a..7c14c21da 100644
--- a/saber/core/impl/bm/tensor_op_bm.cpp
+++ b/saber/core/impl/bm/tensor_op_bm.cpp
@@ -6,13 +6,15 @@ namespace saber {
 template<>
 void fill_tensor_const<BM>(Tensor<BM>& tensor, float value,
                            typename Tensor<BM>::API::stream_t stream = NULL) {
-    Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
+    Tensor<X86> temp_tensor(tensor.shape(), tensor.get_dtype());
+    temp_tensor.set_shape(tensor.valid_shape());
     fill_tensor_const(temp_tensor, value);
     tensor.copy_from(temp_tensor);
 }
 template<>
 void fill_tensor_rand<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream_t stream = NULL) {
-    Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
+    Tensor<X86> temp_tensor(tensor.shape(), tensor.get_dtype());
+    temp_tensor.set_shape(tensor.valid_shape());
     fill_tensor_rand(temp_tensor);
     tensor.copy_from(temp_tensor);
 }
@@ -20,15 +22,17 @@ void fill_tensor_rand<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream_t
 template<>
 void fill_tensor_rand<BM>(Tensor<BM>& tensor, float vstart, float vend,
                           typename Tensor<BM>::API::stream_t stream = NULL) {
-    Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
+    Tensor<X86> temp_tensor(tensor.shape(), tensor.get_dtype());
+    temp_tensor.set_shape(tensor.valid_shape());
     fill_tensor_rand(temp_tensor, vstart, vend);
     tensor.copy_from(temp_tensor);
 }
 
 template<>
 void print_tensor<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream_t stream = NULL) {
-    LOG(INFO) << "device tensor data";
-    Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
+    LOG(INFO) << "BM device tensor data:";
+    Tensor<X86> temp_tensor(tensor.shape(), tensor.get_dtype());
+    temp_tensor.set_shape(tensor.valid_shape());
     temp_tensor.copy_from(tensor);
     print_tensor(temp_tensor);
 }
@@ -41,7 +45,8 @@ void print_tensor_valid<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream
 
 template<>
 double tensor_mean_value<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::stream_t stream = NULL) {
-    Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
+    Tensor<X86> temp_tensor(tensor.shape(), tensor.get_dtype());
+    temp_tensor.set_shape(tensor.valid_shape());
     temp_tensor.copy_from(tensor);
     return tensor_mean_value(temp_tensor);
 }
@@ -49,7 +54,8 @@ double tensor_mean_value<BM>(Tensor<BM>& tensor, typename Tensor<BM>::API::strea
 template<>
 double tensor_mean_value_valid<BM>(Tensor<BM>& tensor,
                                    typename Tensor<BM>::API::stream_t stream = NULL) {
-    Tensor<X86> temp_tensor(tensor.valid_shape(),tensor.get_dtype());
+    Tensor<X86> temp_tensor(tensor.shape(), tensor.get_dtype());
+    temp_tensor.set_shape(tensor.valid_shape());
     temp_tensor.copy_from(tensor);
     return tensor_mean_value(temp_tensor);
 }
diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h
index b888a5e24..6991119c8 100644
--- a/saber/core/target_wrapper.h
+++ b/saber/core/target_wrapper.h
@@ -450,7 +450,7 @@ struct TargetWrapper<BM, __device_target> {
     static bm_handle_t get_handle();
 
 };
-#endif //USE_BM
+#endif //USE_BM_PLACE
 
 #ifdef AMD_GPU
 
diff --git a/saber/funcs/impl/bm/device/bm_common.h b/saber/funcs/impl/bm/device/bm_common.h
new file mode 100644
index 000000000..b10230dad
--- /dev/null
+++ b/saber/funcs/impl/bm/device/bm_common.h
@@ -0,0 +1,156 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_COMMON_H
+#define ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_COMMON_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <math.h>
+#include <unistd.h>
+#include <time.h>
+#include "bm_config.h"
+#include "op_code.h"
+#include "bm_memmap.h"
+#include "firmware_core_kernel.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_MESSAGE
+#ifdef DEBUG_MESSAGE
+#define MSG_DBG(fmt, ...)       printf("MSG: "fmt, ##__VA_ARGS__)
+#else
+#define MSG_DBG(fmt, ...)
+#endif
+
+
+#define INLINE                  inline
+
+#define UNUSED(x)               (void)(x)
+
+#define __ALIGN_MASK(x,mask)    (((x)+(mask))&~(mask))
+#define ALIGN(x,a)              __ALIGN_MASK(x,(__typeof__(x))(a)-1)
+
+#define ROUND_UP(A, B)  ((A)/(B) + ((A) % (B) == 0 ? 0 : 1))
+
+#define bm_min(x, y)               ((x) < (y) ? (x) : (y))
+#define bm_max(x, y)               ((x) > (y) ? (x) : (y))
+
+
+typedef unsigned char           u8;
+typedef unsigned short          u16;
+typedef unsigned int            u32;
+typedef unsigned long long      u64;
+
+typedef union {
+  int ival;
+  float fval;
+} IF_VAL;
+
+typedef u32 tuple4_u32[4];
+
+typedef struct tensor_info{
+    u32 n,c,h,w;
+    u32 w_stride, n_stride, c_stride, h_stride;
+    u32 address;
+    u32 data_format;
+    u32 neuron_matrix;		//0: neuron, 1: matrix
+    u32 matrix_col_magin;	//the magin is not 0, when column_num%w_param!=0
+}TENSOR_INFO;
+
+#define FLOAT_SIZE              4
+#define INT8_SIZE               1
+#define FLOAT_BITWIDTH          32
+#define GET_U64(U32_H, U32_L)   (((u64)(U32_H) << 32) | (u64)(U32_L))
+
+typedef enum {
+    CAFFE_SUPPORT             = 0,
+    TENSORFLOW_SUPPORT        = 1
+} PLATFORM_SUPPORT;
+
+typedef enum {
+    NODECHIP_REG    = 0,
+    HOST_REG        = 1
+} REG_TYPE;
+
+typedef struct kernel_param{
+    int g;
+    int oc;
+    int ic;
+    int h;
+    int w;
+} bm_kernel_param_t;
+
+typedef struct bm_conv_param{
+    int stride_h;
+    int stride_w;
+    int pad_h;
+    int pad_w;
+    int dilation_h;
+    int dilation_w;
+    bool result_add;
+} bm_conv_param_t;
+
+typedef struct conv_secs_info{
+    int ocsecs;
+    int icsecs;
+    int nsecs;
+    int hsecs;
+} conv_secs_info_t;
+
+static INLINE int ceiling_func(int numerator, int denominator)
+{
+  return (numerator + denominator - 1) / denominator;
+}
+
+static INLINE int ceiling_func_shift(int numerator, int shift)
+{
+  return (numerator + (1 << shift) - 1) >> shift;
+}
+
+static int INLINE calc_offset(int *shape, int *offset)
+{
+  return ((offset[0] * shape[1] + offset[1]) * shape[2] + offset[2])
+      * shape[3] + offset[3];
+}
+
+//All the size are in the units of bytes
+static int INLINE get_index_csize_global(int h, int w, int index_bitwidth)
+{
+  int size = h * w * index_bitwidth;
+  //32 bit align
+  return (((size >> 5)) + ((size & 0x1f) != 0)) * FLOAT_SIZE;
+}
+
+static int INLINE get_index_cstride_global(int h, int w, int index_bitwidth)
+{
+  int size = h * w * index_bitwidth;
+  //32 bit align
+  return (((size >> 5)) +
+          ((size & 0x1f) != 0)) * FLOAT_BITWIDTH / index_bitwidth;
+}
+
+static int INLINE get_neuron_csize_local(int h, int w)
+{
+  int size = h * w;
+  //EU_NUM neurons align
+  return ALIGN(size,EU_NUM) * FLOAT_SIZE;
+}
+
+static int INLINE addr_EU_align(int addr){
+  addr = addr / FLOAT_SIZE;
+  return ALIGN( addr, EU_NUM ) * FLOAT_SIZE;
+}
+
+static int INLINE get_cstride_local(int h, int w)
+{
+  int size = h * w;
+  //EU_NUM neurons align
+  return ALIGN(size,EU_NUM);
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_COMMON_H */
diff --git a/saber/funcs/impl/bm/device/bm_memmap.h b/saber/funcs/impl/bm/device/bm_memmap.h
new file mode 100644
index 000000000..18d8185e6
--- /dev/null
+++ b/saber/funcs/impl/bm/device/bm_memmap.h
@@ -0,0 +1,61 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_MEMMAP_H
+#define ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_MEMMAP_H
+
+#define ITCM_MEM_START_ADDR            0x00000000
+#define ITCM_MEM_SIZE                  0x00080000           // 512KB
+#define DTCM_MEM_START_ADDR            0x02000000
+#define DTCM_MEM_SIZE                  0x00010000           // 64KB
+#define SHARE_MEM_START_ADDR           (DTCM_MEM_START_ADDR + DTCM_MEM_SIZE)
+#define SHARE_MEM_SIZE                 0x00010000           // 64KB
+
+#define LOCAL_MEM_ADDRWIDTH            18
+#define LOCAL_MEM_START_ADDR           0x04000000
+//#define LOCAL_MEM_SIZE                 (1<<LOCAL_MEM_ADDRWIDTH)  // 256KB each
+
+#define BDC_RAM_SIZE                   0x00040000    // 512 bit * 4096 entries
+#define BDC_RAM_ADDR                   0x45100000
+
+#define SPI_CTLR_BASE_ADDR_REMAP       0x44000000
+
+#define DDR_CTLR_BASE_ADDR             0x50000000
+
+#define GDMA_ENIGNE_BASE_ADDR          0x60000000
+#define BD_ENIGNE_BASE_ADDR            0x60002000
+#define CDMA_ENIGNE_BASE_ADDR          0x60003000
+#define MINIMAC_BASE_ADDR              0x60005000
+
+#define COUNT_RESERVED_DDR_SWAP        0x1000000
+#define COUNT_RESERVED_DDR_INSTR       0x1000000
+#define COUNT_RESERVED_DDR_IMAGE_SCALE       0x2000000
+
+#define GLOBAL_MEM_START_ADDR_BDC      0x80000000 + COUNT_RESERVED_DDR_INSTR + COUNT_RESERVED_DDR_SWAP
+#define GLOBAL_MEM_START_ADDR_ARM      0x80000000
+
+#define GLOBAL_MEM_BOUNDARY            0x80000000
+#define PCIE_SOC_BASE_DISTANCE         0x100000000
+
+#ifdef SOC_MODE
+  #define GLOBAL_MEM_START_ADDR_CMD      0x0
+  #define GLOBAL_MEM_START_ADDR          0x200000000
+#else
+  #define GLOBAL_MEM_START_ADDR_CMD      0x0
+  #define GLOBAL_MEM_START_ADDR          0x100000000
+#endif
+
+#define GLOBAL_MEM_ALIGN_SIZE          (4)
+
+#define SPI_CTLR_BASE_ADDR             0xFFF00000
+
+#define ITCM_MAP_B_START_ADDR          0x6001A000
+
+#define SFU_TABLE_ADDR_OFFSET          GLOBAL_MEM_START_ADDR_BDC + 0
+
+#define SOFT_RESET_REG_ADDR            0x50008004
+#define GDMA_SOFT_RESET_BIT            5
+#define NPS_SOFT_RESET_BIT             6
+#define CHIPLINK_SOFT_RESET_BIT        7
+
+#define SHARE_REG_BASE_ADDR            (0x50008240)
+
+
+#endif /* ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_MEMMAP_H */
\ No newline at end of file
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c
new file mode 100644
index 000000000..4c890b20e
--- /dev/null
+++ b/saber/funcs/impl/bm/device/bmkernel_base.c
@@ -0,0 +1,28 @@
+#include "bmkernel_base.h"
+#include "bm_config.h"
+#include <stdio.h>
+/**
+ * bmkernel_func is the user entry to BMKERNEL just like "main" to some applications.
+ * 
+ * \param args - Pointer to arguments that user sends from host.
+ *               op - Flag to determine the operation type.             
+ */
+
+int bmkernel_func(void *args)
+{
+    bmkernel_api_base* param = (bmkernel_api_base *)args;
+    switch (param->op) {
+        case ACTIVATION: {
+            // bm_activation_fwd(param)
+            return 0;
+        }
+        case CONV: {
+            // bm_api_conv_forward* api = (bm_api_conv_forward *)param->opParam;
+            return 0;
+        }
+        default: {
+            printf("op %d is not supported by BM yet.\n", param->op);
+            return -1;
+        }
+    }
+}
diff --git a/saber/funcs/impl/bm/device/bmkernel_base.h b/saber/funcs/impl/bm/device/bmkernel_base.h
new file mode 100644
index 000000000..5a192127c
--- /dev/null
+++ b/saber/funcs/impl/bm/device/bmkernel_base.h
@@ -0,0 +1,47 @@
+#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BMKERNEL_BASE_H
+#define ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BMKERNEL_BASE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum BmOpType {
+    ACTIVATION, 
+    CONV
+};
+
+typedef struct {
+    unsigned long long             ifmap_offset_global;
+    unsigned long long             ofmap_offset_global;
+    unsigned long long             weight_offset_global;
+    unsigned long long             bias_offset_global;
+    int                            input_n;   // note this is total input_n
+    int                            input_c;
+    int                            input_h;
+    int                            input_w;
+    int                            groups;
+    int                            output_c;
+    int                            kh;
+    int                            kw;
+    int                            dh;
+    int                            dw;
+    int                            pad_h;
+    int                            pad_w;
+    int                            stride_h;
+    int                            stride_w;
+    int                            using_bias;
+    int                            result_add;
+    int                            icsecs;
+    int                            ocsecs;
+    int                            nsecs;
+    int                            hsecs;
+} __attribute__((packed)) bm_api_conv_forward;
+
+typedef struct {
+    enum BmOpType op; // Flag to determine the operation type.
+    void* opParam;
+} __attribute__((packed)) bmkernel_api_base;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BMKERNEL_BASE_H */
diff --git a/test/saber/test_saber_buffer.cpp b/test/saber/test_saber_buffer.cpp
index f1afc3a9b..ac49f4f20 100644
--- a/test/saber/test_saber_buffer.cpp
+++ b/test/saber/test_saber_buffer.cpp
@@ -138,7 +138,7 @@ TEST(TestSaberFunc, test_saber_buffer) {
     test_buffer<ARM, ARM, AK_INT8>();
 #endif
 
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
     LOG(INFO) << "test BM FP32 buffer";
     //test_buffer<BM, X86, AK_FLOAT>();
 #endif
diff --git a/test/saber/test_saber_context.cpp b/test/saber/test_saber_context.cpp
index db2e7a6f9..b7c41b768 100644
--- a/test/saber/test_saber_context.cpp
+++ b/test/saber/test_saber_context.cpp
@@ -57,12 +57,12 @@ TEST(TestSaberFunc, test_arm_context) {
 }
 #endif //USE_ARM_PLACE
 
-#ifdef USE_BM
+#ifdef USE_BM_PLACE
 TEST(TestSaberFunc, test_BM_context) {
     Context<BM> ctx;
     CHECK_NOTNULL(ctx.get_handle()) << "Failed to get BM handle";
 }
-#endif //USE_BM
+#endif //USE_BM_PLACE
 
 int main(int argc, const char** argv) {
     // initial logger
diff --git a/test/saber/test_saber_conv.cpp b/test/saber/test_saber_conv.cpp
index 52411fbb3..6fa1bede6 100644
--- a/test/saber/test_saber_conv.cpp
+++ b/test/saber/test_saber_conv.cpp
@@ -11,7 +11,6 @@ using namespace anakin::saber;
 #define CHECK_RESULT
 //#define CHECK_SPEED
 
-#if 0
 #ifdef USE_BM_PLACE
 TEST(TestSaberFunc, test_saber_conv_results_bm) {
     Env<BM>::env_init();
@@ -70,7 +69,6 @@ TEST(TestSaberFunc, test_saber_conv_results_bm) {
     }
 }
 #endif
-#endif
 
 TEST(TestSaberFunc, test_saber_conv_results) {
 #ifdef USE_CUDA
diff --git a/test/saber/test_saber_tensor.cpp b/test/saber/test_saber_tensor.cpp
index b4917ec31..f1114f415 100644
--- a/test/saber/test_saber_tensor.cpp
+++ b/test/saber/test_saber_tensor.cpp
@@ -622,7 +622,6 @@ TEST(TestSaberFunc, test_saber_tensor_shape) {
 
 #ifdef USE_BM
     Env<BM>::env_init();
-    Env<X86>::env_init();
     LOG(INFO) << "test BM tensor shape API";
     test_tensor_shape<BM>();
 #endif //USE_BM
@@ -745,7 +744,7 @@ TEST(TestSaberFunc, test_tensor_reshape_realloc) {
     Env<BM>::env_init();
     Env<X86>::env_init();
     LOG(INFO) << "test BM FP32 tensor reshape realloc";
-    //tensor_reshape_realloc<BM, X86, AK_FLOAT>();
+    tensor_reshape_realloc<BM, X86, AK_FLOAT>();
 #endif //USE_BM
 }
 #endif