From 163b964f3bd734095a7b6975e3a34a1cb6496958 Mon Sep 17 00:00:00 2001 From: Henneking Date: Thu, 27 Jul 2017 11:02:06 -0700 Subject: [PATCH 01/14] CastStorageDnsRsp GPU Implementation --- src/operator/tensor/cast_storage-inl.cuh | 453 ++++++++++++++++++----- 1 file changed, 368 insertions(+), 85 deletions(-) diff --git a/src/operator/tensor/cast_storage-inl.cuh b/src/operator/tensor/cast_storage-inl.cuh index 47fb3b42c356..326e472121f5 100644 --- a/src/operator/tensor/cast_storage-inl.cuh +++ b/src/operator/tensor/cast_storage-inl.cuh @@ -14,13 +14,288 @@ namespace mxnet { namespace op { using mshadow::cuda::kBaseThreadNum; +using mshadow::Shape1; +using mxnet_op::Kernel; -inline void CastStorageDnsRspImpl(const OpContext& ctx, const gpu& gpu_dev, const TBlob& dns, NDArray* rsp) { - LOG(FATAL) << "CastStorageDnsRspImpl gpu version is not implemented."; +/*! + * \brief Thread kernel for marking non-zero rows of a tensor + * Parallelized by tensor rows: 1 thread/row + */ +struct MarkRspRowIdxThreadKernel { + /*! + * \brief + * \param tid global thread id + * \param row_flg row flag array to mark non-zero rows + * \param dns dense matrix data + * \param num_rows number of rows (size of first dimension of tensor) + * \param row_length number of elements per row + */ + template + __device__ __forceinline__ static void Map(int tid, + RType* row_flg, + const DType* dns, + const index_t num_rows, + const index_t row_length) { + if (tid < num_rows) { + index_t j = 0; + index_t offset = tid * row_length; + for (; j < row_length; ++j) { + if (dns[offset+j] != 0) { + break; + } + } + if (j < row_length) { + row_flg[tid] = 1; // mark as one for non-zero row + } else { + row_flg[tid] = 0; // mark as zero for zero row + } + } + } +}; + +/*! + * \brief Thread kernel for marking non-zero rows of a tensor + * Parallelized by tensor rows: 1 warp/row + */ +struct MarkRspRowIdxWarpKernel { + template + __device__ __forceinline__ static void Map(int tid, + RType* row_flg, + const DType* dns, + const index_t num_rows, + const index_t row_length) { + typedef cub::WarpReduce WarpReduce; + const index_t warps_per_block = kBaseThreadNum / 32; + __shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block]; + + const index_t warp_id = tid / 32; // global warp id + const index_t warp_lane = threadIdx.x / 32; // local warp id within thread block + const index_t lane = tid & (32-1); // local thread id within warp + + if (warp_id < num_rows) { + index_t flg = 0; + index_t offset = warp_id * row_length; + for (index_t j = lane; j < row_length; j+=32) { + if (dns[offset+j] != 0) { + // avoid break: causes slower performance on sparse tensors (<20% density), + // due to thread divergence + flg++; + } + } + index_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(flg); + if (lane == 0) { + if (aggr > 0) { + row_flg[warp_id] = 1; // mark as one for non-zero row + } else { + row_flg[warp_id] = 0; // mark as zero for zero row + } + } + } + } +}; + +/*! 
+ * \brief Thread kernel for marking non-zero rows of a tensor + * Parallelized by tensor rows: 1 threadBlock/row + */ +struct MarkRspRowIdxBlockKernel { + template + __device__ __forceinline__ static void Map(int tid, + RType* row_flg, + const DType* dns, + const index_t num_rows, + const index_t row_length) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + if (blockIdx.x < num_rows) { + index_t flg = 0; + index_t offset = blockIdx.x * row_length; + for (index_t j = threadIdx.x; j < row_length; j+=kBaseThreadNum) { + if (dns[offset+j] != 0) { + // avoid break: causes slower performance on sparse tensors (<20% density), + // due to thread divergence + flg++; + } + } + index_t aggr = BlockReduce(temp_storage).Sum(flg); + if (threadIdx.x == 0) { + if (aggr > 0) { + row_flg[blockIdx.x] = 1; // mark as one for non-zero row + } else { + row_flg[blockIdx.x] = 0; // mark as zero for zero row + } + } + } + } +}; + +/*! + * \brief Kernel for filling the row index array of the rsp tensor + * Parallelized by tensor rows: 1 thread/row + */ +struct FillRspRowIdxKernel { + /*! + * \brief + * \param tid global thread id + * \param row_idx row index array to store indices of non-zero rows + * \param row_flg_sum inclusive prefix sum array over marked row flag array + * \param num_rows number of rows (size of first dimension of tensor) + */ + template + __device__ __forceinline__ static void Map(int tid, + RType* row_idx, + const RType* row_flg_sum, + const index_t num_rows) { + if (tid < num_rows) { + index_t prev = (tid == 0)? 0 : row_flg_sum[tid-1]; + if (row_flg_sum[tid] > prev) { + row_idx[prev] = tid; + } + } + } +}; + +/*! + * \brief Kernel for filling the value array of the rsp tensor + * Parallelized by rsp tensor elements: 1 thread/element + */ +struct FillRspValsKernel { + /*! + * \brief + * \param tid global thread id + * \param rsp_val value array of rsp tensor to store data + * \param row_idx indices of non-zero rows + * \param dns dense matrix data + * \param nnr number of non-zero rows + * \param row_length number of elements per row + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* rsp_val, + const RType* row_idx, + const DType* dns, + const index_t nnr, + const index_t row_length) { + if (tid < nnr*row_length) { + const index_t row_id = tid / row_length; + const index_t row_el = tid % row_length; + const index_t dns_idx = row_idx[row_id] * row_length + row_el; + rsp_val[tid] = dns[dns_idx]; + } + } +}; + +/*! + * \brief GPU implementation of casting a dns tensor to rsp type. 
+ */ +inline void CastStorageDnsRspImpl(const OpContext& ctx, + const gpu& gpu_dev, + const TBlob& dns, + NDArray* rsp) { + CHECK(rsp != nullptr); + CHECK_EQ(rsp->storage_type(), kRowSparseStorage); + CHECK_EQ(dns.shape_, rsp->shape()); + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(rsp->aux_type(rowsparse::kIdx), RType, { // row idx type + const index_t num_rows = dns.shape_[0]; + const index_t row_length = dns.shape_.ProdShape(1, dns.shape_.ndim()); + const index_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + const index_t threads_per_block = kBaseThreadNum; + const index_t min_num_warps = 512; + index_t num_threads; + // TODO: remove kernel dependency on warpSize=32 + if (threads_per_warp != 32) { + LOG(FATAL) << "CastStorageDnsRspImpl GPU kernels expect warpSize=32"; + } + // Determine temporary device storage requirements + RType* row_flg = NULL; + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg, + row_flg, + num_rows, + mshadow::Stream::GetStream(s)); + + // Allocate temp storage for marking non-zero rows and for cub's prefix sum + mshadow::Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(num_rows*sizeof(RType)+temp_storage_bytes), s); + row_flg = reinterpret_cast(workspace.dptr_); + d_temp_storage = workspace.dptr_ + num_rows*sizeof(RType); + + // Mark non-zero rows as 'one' in row_flg + // Different kernel versions are optimized for different matrix instances + // (1) 'Thread kernel' (one thread computing one row) + // (2) 'Warp kernel' (one warp computing one row) + // (3) 'Block kernel' (one thread block computing one row) + const int kernel_version = 0; + switch (kernel_version) { + case 1: + num_threads = num_rows; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + break; + case 2: + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + break; + case 3: + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + break; + default: + if (row_length < threads_per_warp) { + num_threads = num_rows; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + } else if (row_length < threads_per_block || num_rows > min_num_warps) { + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + } else { + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + } + break; + } + // Compute non-zero row indices through inclusive prefix sum + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg, + row_flg, + num_rows, + mshadow::Stream::GetStream(s)); + + // Get total number of non-zero rows from device + RType nnr = 0; + CUDA_CALL(cudaMemcpy(&nnr, &row_flg[num_rows-1], sizeof(RType), cudaMemcpyDeviceToHost)); + + // Allocate rsp tensor row index array and fill + rsp->CheckAndAllocAuxData(rowsparse::kIdx, Shape1(static_cast(nnr))); + if (0 == nnr) return; + RType* row_idx = rsp->aux_data(rowsparse::kIdx).dptr(); + num_threads = num_rows; + Kernel::Launch(s, num_threads, + row_idx, row_flg, num_rows); + + // Construct shape of rsp tensor data, allocate, and fill + auto storage_shape = dns.shape_; + storage_shape[0] = nnr; + 
rsp->CheckAndAllocData(storage_shape); + num_threads = nnr * row_length; + Kernel::Launch(s, num_threads, + rsp->data().dptr(), row_idx, dns.dptr(), nnr, row_length); + }); + }); } /*! - * \brief Thread kernel for initializing the indptr in a csr tensor. + * \brief Thread kernel for initializing the indptr in a csr tensor * Parallelized by matrix rows: 1 thread/row */ struct FillCsrIndPtrThreadKernel { @@ -33,15 +308,16 @@ struct FillCsrIndPtrThreadKernel { * \param num_cols number of columns of the dense matrix */ template - __device__ __forceinline__ static void Map(int tid, IType* indptr, const DType* dns, - const int num_rows, const int num_cols) { + __device__ __forceinline__ static void Map(int tid, + IType* indptr, const DType* dns, + const index_t num_rows, const index_t num_cols) { if (tid == 0) { indptr[tid] = 0; } if (tid < num_rows) { - int nnz = 0; - const int offset = tid * num_cols; - for (int j = 0; j < num_cols; ++j) { + index_t nnz = 0; + const index_t offset = tid * num_cols; + for (index_t j = 0; j < num_cols; ++j) { if (dns[offset+j] != 0) { nnz++; } @@ -67,13 +343,14 @@ struct FillCsrColIdxAndValsThreadKernel { * \param num_cols number of columns of the dense matrix */ template - __device__ __forceinline__ static void Map(int tid, DType* val, CType* col_idx, - const IType* indptr, const DType* dns, - const int num_rows, const int num_cols) { + __device__ __forceinline__ static void Map(int tid, + DType* val, CType* col_idx, + const IType* indptr, const DType* dns, + const index_t num_rows, const index_t num_cols) { if (tid < num_rows) { - const int offset = tid * num_cols; - int k = indptr[tid]; - for (int j = 0; j < num_cols; ++j) { + const index_t offset = tid * num_cols; + index_t k = indptr[tid]; + for (index_t j = 0; j < num_cols; ++j) { if (dns[offset+j] != 0) { val[k] = dns[offset+j]; col_idx[k] = j; @@ -90,27 +367,28 @@ struct FillCsrColIdxAndValsThreadKernel { */ struct FillCsrIndPtrWarpKernel { template - __device__ __forceinline__ static void Map(int tid, IType* indptr, const DType* dns, - const int num_rows, const int num_cols) { - typedef cub::WarpReduce WarpReduce; - const int warps_per_block = kBaseThreadNum / 32; + __device__ __forceinline__ static void Map(int tid, + IType* indptr, const DType* dns, + const index_t num_rows, const index_t num_cols) { + typedef cub::WarpReduce WarpReduce; + const index_t warps_per_block = kBaseThreadNum / 32; __shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block]; if (tid == 0) { indptr[tid] = 0; } - const int warp_id = tid / 32; // global warp id - const int warp_lane = threadIdx.x / 32; // local warp id within thread block - const int lane = tid & (32-1); // local thread id within warp + const index_t warp_id = tid / 32; // global warp id + const index_t warp_lane = threadIdx.x / 32; // local warp id within thread block + const index_t lane = tid & (32-1); // local thread id within warp if (warp_id < num_rows) { - int lane_nnz = 0; - const int offset = warp_id * num_cols; - for (int j = lane; j < num_cols; j+=32) { + index_t lane_nnz = 0; + const index_t offset = warp_id * num_cols; + for (index_t j = lane; j < num_cols; j+=32) { if (dns[offset+j] != 0) { lane_nnz++; } } - int aggr = WarpReduce(temp_storage[warp_lane]).Sum(lane_nnz); + index_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(lane_nnz); if (lane == 0) { indptr[warp_id+1] = aggr; } @@ -124,22 +402,23 @@ struct FillCsrIndPtrWarpKernel { */ struct FillCsrColIdxAndValsWarpKernel { template - __device__ __forceinline__ static void Map(int 
tid, DType* val, CType* col_idx, - const IType* indptr, const DType* dns, - const int num_rows, const int num_cols) { - typedef cub::WarpScan WarpScan; - const int warps_per_block = kBaseThreadNum / 32; + __device__ __forceinline__ static void Map(int tid, + DType* val, CType* col_idx, + const IType* indptr, const DType* dns, + const index_t num_rows, const index_t num_cols) { + typedef cub::WarpScan WarpScan; + const index_t warps_per_block = kBaseThreadNum / 32; __shared__ typename WarpScan::TempStorage temp_storage[warps_per_block]; - __shared__ volatile int warp_nnz[warps_per_block]; + __shared__ volatile index_t warp_nnz[warps_per_block]; - const int warp_id = tid / 32; // global warp id - const int warp_lane = threadIdx.x / 32; // local warp id within thread block - const int lane = tid & (32-1); // local thread id within warp + const index_t warp_id = tid / 32; // global warp id + const index_t warp_lane = threadIdx.x / 32; // local warp id within thread block + const index_t lane = tid & (32-1); // local thread id within warp if (warp_id < num_rows) { - const int offset = warp_id * num_cols; - int k = indptr[warp_id]; - int nnz; - for (int j = lane; j < num_cols+lane; j+=32) { + const index_t offset = warp_id * num_cols; + index_t k = indptr[warp_id]; + index_t nnz; + for (index_t j = lane; j < num_cols+lane; j+=32) { nnz = 0; if (j < num_cols) { if (dns[offset+j] != 0) { @@ -168,28 +447,29 @@ struct FillCsrColIdxAndValsWarpKernel { }; /*! - * \brief Block kernel for initializing the indptr in a csr tensor. + * \brief Block kernel for initializing the indptr in a csr tensor * Parallelized by matrix rows: 1 threadBlock/row */ struct FillCsrIndPtrBlockKernel { template - __device__ __forceinline__ static void Map(int tid, IType* indptr, const DType* dns, - const int num_rows, const int num_cols) { - typedef cub::BlockReduce BlockReduce; + __device__ __forceinline__ static void Map(int tid, + IType* indptr, const DType* dns, + const index_t num_rows, const index_t num_cols) { + typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; if (tid == 0) { indptr[tid] = 0; } if (blockIdx.x < num_rows) { - int lane_nnz = 0; - const int offset = blockIdx.x * num_cols; - for (int j = threadIdx.x; j < num_cols; j+=kBaseThreadNum) { + index_t lane_nnz = 0; + const index_t offset = blockIdx.x * num_cols; + for (index_t j = threadIdx.x; j < num_cols; j+=kBaseThreadNum) { if (dns[offset+j] != 0) { lane_nnz++; } } - int aggr = BlockReduce(temp_storage).Sum(lane_nnz); + index_t aggr = BlockReduce(temp_storage).Sum(lane_nnz); if (threadIdx.x == 0) { indptr[blockIdx.x+1] = aggr; } @@ -203,18 +483,19 @@ struct FillCsrIndPtrBlockKernel { */ struct FillCsrColIdxAndValsBlockKernel { template - __device__ __forceinline__ static void Map(int tid, DType* val, CType* col_idx, - const IType* indptr, const DType* dns, - const int num_rows, const int num_cols) { - typedef cub::BlockScan BlockScan; + __device__ __forceinline__ static void Map(int tid, + DType* val, CType* col_idx, + const IType* indptr, const DType* dns, + const index_t num_rows, const index_t num_cols) { + typedef cub::BlockScan BlockScan; __shared__ typename BlockScan::TempStorage temp_storage; - __shared__ volatile int block_nnz; + __shared__ volatile index_t block_nnz; if (blockIdx.x < num_rows) { - const int offset = blockIdx.x * num_cols; - int k = indptr[blockIdx.x]; - int nnz; - for (int j = threadIdx.x; j < num_cols+threadIdx.x; j+=kBaseThreadNum) { + const index_t offset = blockIdx.x * num_cols; + 
index_t k = indptr[blockIdx.x]; + index_t nnz; + for (index_t j = threadIdx.x; j < num_cols+threadIdx.x; j+=kBaseThreadNum) { nnz = 0; if (j < num_cols) { if (dns[offset+j] != 0) { @@ -243,8 +524,7 @@ struct FillCsrColIdxAndValsBlockKernel { }; /*! - * \brief - * GPU implementation of casting a dense matrix to csr type. + * \brief GPU implementation of casting a dense matrix to csr type */ inline void CastStorageDnsCsrImpl(const OpContext& ctx, const gpu& gpu_dev, @@ -260,12 +540,15 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx, MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIdx), CType, { // col_idx type const index_t num_rows = dns.shape_[0]; const index_t num_cols = dns.shape_[1]; - const int threads_per_warp = 32; - const int threads_per_block = kBaseThreadNum; - const int min_num_warps = 512; - int num_threads; - - csr->CheckAndAllocAuxData(csr::kIndPtr, mshadow::Shape1(num_rows+1)); + const index_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + const index_t threads_per_block = kBaseThreadNum; + const index_t min_num_warps = 512; + index_t num_threads; + // TODO: remove kernel dependency on warpSize=32 + if (threads_per_warp != 32) { + LOG(FATAL) << "CastStorageDnsCsrImpl GPU kernels expect warpSize=32"; + } + csr->CheckAndAllocAuxData(csr::kIndPtr, Shape1(num_rows+1)); IType* indptr = csr->aux_data(csr::kIndPtr).dptr(); DType* dns_data = dns.dptr(); @@ -277,32 +560,32 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx, switch (kernel_version) { case 1: num_threads = num_rows; - mxnet_op::Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, indptr, dns_data, num_rows, num_cols); break; case 2: num_threads = num_rows * threads_per_warp; - mxnet_op::Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, indptr, dns_data, num_rows, num_cols); break; case 3: num_threads = num_rows * threads_per_block; - mxnet_op::Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, indptr, dns_data, num_rows, num_cols); break; default: if (num_cols < threads_per_warp) { num_threads = num_rows; - mxnet_op::Kernel::Launch(s, num_threads, - indptr, dns_data, num_rows, num_cols); + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); } else if (num_cols < threads_per_block || num_rows > min_num_warps) { num_threads = num_rows * threads_per_warp; - mxnet_op::Kernel::Launch(s, num_threads, - indptr, dns_data, num_rows, num_cols); + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); } else { num_threads = num_rows * threads_per_block; - mxnet_op::Kernel::Launch(s, num_threads, - indptr, dns_data, num_rows, num_cols); + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); } break; } @@ -314,12 +597,12 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx, temp_storage_bytes, indptr, indptr, - static_cast(num_rows+1), + num_rows+1, mshadow::Stream::GetStream(s)); // Allocate temporary storage mshadow::Tensor workspace = ctx.requested[0] - .get_space_typed(mshadow::Shape1(temp_storage_bytes), s); + .get_space_typed(Shape1(temp_storage_bytes), s); d_temp_storage = workspace.dptr_; // Compute indptr through inclusive prefix sum @@ -327,7 +610,7 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx, temp_storage_bytes, indptr, indptr, - static_cast(num_rows+1), + num_rows+1, mshadow::Stream::GetStream(s)); // Receive total number of nnz values from device @@ -335,43 +618,43 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx, CUDA_CALL(cudaMemcpy(&nnz, &(indptr[num_rows]), 
sizeof(IType), cudaMemcpyDeviceToHost)); // Allocate column index array and data array of the csr matrix - csr->CheckAndAllocAuxData(csr::kIdx, mshadow::Shape1(static_cast(nnz))); - csr->CheckAndAllocData(mshadow::Shape1(static_cast(nnz))); + csr->CheckAndAllocAuxData(csr::kIdx, Shape1(static_cast(nnz))); + csr->CheckAndAllocData(Shape1(static_cast(nnz))); // Compute and fill column index array and data array of the csr matrix switch (kernel_version) { case 1: num_threads = num_rows; - mxnet_op::Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), indptr, dns_data, num_rows, num_cols); break; case 2: num_threads = num_rows * threads_per_warp; - mxnet_op::Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), indptr, dns_data, num_rows, num_cols); break; case 3: num_threads = num_rows * threads_per_block; - mxnet_op::Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), indptr, dns_data, num_rows, num_cols); break; default: if (num_cols < threads_per_warp) { num_threads = num_rows; - mxnet_op::Kernel::Launch(s, num_threads, - csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), - indptr, dns_data, num_rows, num_cols); + Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); } else if (num_cols < threads_per_block || num_rows > min_num_warps) { num_threads = num_rows * threads_per_warp; - mxnet_op::Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), indptr, dns_data, num_rows, num_cols); } else { num_threads = num_rows * threads_per_block; - mxnet_op::Kernel::Launch(s, num_threads, + Kernel::Launch(s, num_threads, csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), indptr, dns_data, num_rows, num_cols); } From 5032c80efd044c98bbbcaa46d2c71ac285d9070d Mon Sep 17 00:00:00 2001 From: Henneking Date: Thu, 27 Jul 2017 11:14:40 -0700 Subject: [PATCH 02/14] updating function doc and some variable types and names --- src/operator/tensor/cast_storage-inl.cuh | 24 ++++++------- src/operator/tensor/cast_storage-inl.h | 45 +++++++++++------------- 2 files changed, 33 insertions(+), 36 deletions(-) diff --git a/src/operator/tensor/cast_storage-inl.cuh b/src/operator/tensor/cast_storage-inl.cuh index 326e472121f5..c1043e4e024c 100644 --- a/src/operator/tensor/cast_storage-inl.cuh +++ b/src/operator/tensor/cast_storage-inl.cuh @@ -18,7 +18,7 @@ using mshadow::Shape1; using mxnet_op::Kernel; /*! - * \brief Thread kernel for marking non-zero rows of a tensor + * \brief Thread kernel for marking non-zero rows of a tensor. * Parallelized by tensor rows: 1 thread/row */ struct MarkRspRowIdxThreadKernel { @@ -54,7 +54,7 @@ struct MarkRspRowIdxThreadKernel { }; /*! - * \brief Thread kernel for marking non-zero rows of a tensor + * \brief Thread kernel for marking non-zero rows of a tensor. * Parallelized by tensor rows: 1 warp/row */ struct MarkRspRowIdxWarpKernel { @@ -95,7 +95,7 @@ struct MarkRspRowIdxWarpKernel { }; /*! - * \brief Thread kernel for marking non-zero rows of a tensor + * \brief Thread kernel for marking non-zero rows of a tensor. * Parallelized by tensor rows: 1 threadBlock/row */ struct MarkRspRowIdxBlockKernel { @@ -130,7 +130,7 @@ struct MarkRspRowIdxBlockKernel { }; /*! 
- * \brief Kernel for filling the row index array of the rsp tensor + * \brief Kernel for filling the row index array of the rsp tensor. * Parallelized by tensor rows: 1 thread/row */ struct FillRspRowIdxKernel { @@ -156,7 +156,7 @@ struct FillRspRowIdxKernel { }; /*! - * \brief Kernel for filling the value array of the rsp tensor + * \brief Kernel for filling the value array of the rsp tensor. * Parallelized by rsp tensor elements: 1 thread/element */ struct FillRspValsKernel { @@ -295,7 +295,7 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, } /*! - * \brief Thread kernel for initializing the indptr in a csr tensor + * \brief Thread kernel for initializing the indptr in a csr matrix. * Parallelized by matrix rows: 1 thread/row */ struct FillCsrIndPtrThreadKernel { @@ -328,7 +328,7 @@ struct FillCsrIndPtrThreadKernel { }; /*! - * \brief Thread kernel for initializing the col_idx and value array of the csr matrix + * \brief Thread kernel for initializing the col_idx and value array of the csr matrix. * Parallelized by matrix rows: 1 thread/row */ struct FillCsrColIdxAndValsThreadKernel { @@ -362,7 +362,7 @@ struct FillCsrColIdxAndValsThreadKernel { }; /*! - * \brief Warp kernel for initializing the indptr in a csr matrix + * \brief Warp kernel for initializing the indptr in a csr matrix. * Parallelized by matrix rows: 1 warp/row */ struct FillCsrIndPtrWarpKernel { @@ -397,7 +397,7 @@ struct FillCsrIndPtrWarpKernel { }; /*! - * \brief Warp kernel for initializing the col_idx and value array of the csr matrix + * \brief Warp kernel for initializing the col_idx and value array of the csr matrix. * Parallelized by matrix rows: 1 warp/row */ struct FillCsrColIdxAndValsWarpKernel { @@ -447,7 +447,7 @@ struct FillCsrColIdxAndValsWarpKernel { }; /*! - * \brief Block kernel for initializing the indptr in a csr tensor + * \brief Block kernel for initializing the indptr in a csr matrix. * Parallelized by matrix rows: 1 threadBlock/row */ struct FillCsrIndPtrBlockKernel { @@ -478,7 +478,7 @@ struct FillCsrIndPtrBlockKernel { }; /*! - * \brief Block kernel for initializing the col_idx and value array of the csr matrix + * \brief Block kernel for initializing the col_idx and value array of the csr matrix. * Parallelized by matrix rows: 1 threadBlock/row */ struct FillCsrColIdxAndValsBlockKernel { @@ -524,7 +524,7 @@ struct FillCsrColIdxAndValsBlockKernel { }; /*! - * \brief GPU implementation of casting a dense matrix to csr type + * \brief GPU implementation of casting a dense matrix to csr type. */ inline void CastStorageDnsCsrImpl(const OpContext& ctx, const gpu& gpu_dev, diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h index 46ae105b80b6..fb9caf9be77c 100644 --- a/src/operator/tensor/cast_storage-inl.h +++ b/src/operator/tensor/cast_storage-inl.h @@ -20,21 +20,21 @@ namespace mxnet { namespace op { /*! - * \brief Kernel for marking row_idx of a RSP matrix per row + * \brief CPU Kernel for marking row_idx of a RSP tensor per row. 
*/ struct MarkRspRowIdx { - // i represents the row index of the matrix data + // i represents the row index of the tensor data template MSHADOW_XINLINE static void Map(int i, RType* row_idx, const DType* data, - const index_t num_cols) { + const index_t row_length) { index_t j = 0; - index_t offset = i * num_cols; - for (; j < num_cols; ++j) { + index_t offset = i * row_length; + for (; j < row_length; ++j) { if (data[offset+j] != 0) { break; } } - if (num_cols == j) { + if (row_length == j) { row_idx[i] = 0; // mark as zero for zero row } else { row_idx[i] = 1; // mark as one for non-zero row @@ -43,8 +43,7 @@ struct MarkRspRowIdx { }; /*! - * \brief - * CPU implementation of casting a dns tensor to rsp type. + * \brief CPU implementation of casting a dns tensor to rsp type. */ inline void CastStorageDnsRspImpl(const OpContext& ctx, const cpu& cpu_dev, @@ -89,19 +88,19 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, // TODO(haibin) Use memcopy instead will be much faster than assigning each individual element struct CastStorageRspDnsKernel { template - MSHADOW_XINLINE static void Map(int i, const index_t width, const IType* idx, const DType *data, + MSHADOW_XINLINE static void Map(int i, const index_t row_length, const IType* idx, const DType *data, DType* dns) { auto rid = idx[i]; - auto dns_offset = rid * width; - auto rsp_offset = i * width; - for (size_t col = 0; col < width; col++) { + auto dns_offset = rid * row_length; + auto rsp_offset = i * row_length; + for (size_t col = 0; col < row_length; col++) { dns[dns_offset + col] = data[rsp_offset + col]; } } }; /*! - * \brief This function assumes that the meomry for dns has been allocated already + * \brief This function assumes that the memory for dns has been allocated already * since the shape is known at binding stage. */ template @@ -128,7 +127,7 @@ void CastStorageRspDnsImpl(const OpContext& ctx, const NDArray& rsp, TBlob* dns) } /*! - * \brief This is the kernel for initializing the indptr in a csr tensor. + * \brief CPU kernel for initializing the indptr in a csr matrix. */ struct FillCsrIndPtr { /*! @@ -153,8 +152,7 @@ struct FillCsrIndPtr { }; /*! - * \brief This is the kernel for initializing the col_idx and value array - * of the csr tensor + * \brief CPU kernel for initializing the col_idx and value array of the csr matrix. */ struct FillCsrColIdxAndVals { /*! @@ -170,10 +168,10 @@ struct FillCsrColIdxAndVals { template MSHADOW_XINLINE static void Map(int i, DType* val, CType* col_idx, const IType* indptr, const DType* dns, - const int num_rows, const int num_cols) { - const int offset = i * num_cols; - int k = indptr[i]; - for (int j = 0; j < num_cols; ++j) { + const index_t num_rows, const index_t num_cols) { + const index_t offset = i * num_cols; + IType k = indptr[i]; + for (index_t j = 0; j < num_cols; ++j) { if (dns[offset+j] != 0) { val[k] = dns[offset+j]; col_idx[k] = j; @@ -184,8 +182,7 @@ struct FillCsrColIdxAndVals { }; /*! - * \brief - * CPU implementation of casting a dns tensor to csr type. + * \brief CPU implementation of casting a dns matrix to csr type. */ inline void CastStorageDnsCsrImpl(const OpContext& ctx, const cpu& cpu_dev, @@ -226,7 +223,7 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx, } /*! - * \brief This is the kernel for copying csr.data to its corresponding dns tensor. + * \brief This is the kernel for copying csr.data to its corresponding dns matrix. */ struct CopyCsrDataToDns { /*! @@ -250,7 +247,7 @@ struct CopyCsrDataToDns { }; /*! 
- * \brief Casts a csr tensor to dns format. + * \brief Casts a csr matrix to dns format. */ template void CastStorageCsrDnsImpl(const OpContext& ctx, const NDArray& csr, TBlob* dns) { From 49c6f27a2619c2976fb3ab7cb8131e2fa6b4e65d Mon Sep 17 00:00:00 2001 From: Henneking Date: Thu, 27 Jul 2017 11:24:08 -0700 Subject: [PATCH 03/14] adding cuda_get_device_prop() util function --- src/operator/mxnet_op.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index d4a473c8be0c..694c50c4fe05 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -10,6 +10,9 @@ #include #include #include +#ifdef __CUDACC__ +#include "../common/cuda_utils.h" +#endif // __CUDACC__ namespace mxnet { namespace op { @@ -32,6 +35,13 @@ int get_num_threads(const int N); i < (n); \ i += blockDim.x * gridDim.x) +inline cudaDeviceProp cuda_get_device_prop() { + int device; + CUDA_CALL(cudaGetDevice(&device)); + cudaDeviceProp deviceProp; + CUDA_CALL(cudaGetDeviceProperties(&deviceProp, device)); + return deviceProp; +} /*! * \brief Get the number of blocks for cuda kernel given N From db35454b5946fde665e15e08d9d0ca42b3873ed9 Mon Sep 17 00:00:00 2001 From: Henneking Date: Thu, 27 Jul 2017 11:29:07 -0700 Subject: [PATCH 04/14] added rand_shape function for n-dimensional tensors --- python/mxnet/test_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 06ef759c53d0..306dea04faf0 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -110,9 +110,15 @@ def rand_ndarray(shape, stype, density=None): def rand_shape_2d(dim0=10, dim1=10): return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1) + def rand_shape_3d(dim0=10, dim1=10, dim2=10): return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1), rnd.randint(1, dim2 + 1) + +def rand_shape_nd(n, dim=10): + return rnd.randint(1, dim+1, size=n) + + def np_reduce(dat, axis, keepdims, numpy_reduce_func): """Compatible reduce for old version of NumPy. From ec7ea7e831d12ca6dfa2a237400fbd5901163581 Mon Sep 17 00:00:00 2001 From: Henneking Date: Thu, 27 Jul 2017 11:35:19 -0700 Subject: [PATCH 05/14] updated cast storage unit test --- tests/python/unittest/test_sparse_operator.py | 53 +++++++++++-------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py index 23fbb8fa2b05..c5a31dca2ffd 100644 --- a/tests/python/unittest/test_sparse_operator.py +++ b/tests/python/unittest/test_sparse_operator.py @@ -69,30 +69,24 @@ def test_elemwise_add_ex_multiple_stages(): # TODO(haibin) also add test for backward pass. 
def test_cast_storage_ex(): - def test_rsp_to_dns(shape): - rsp, (data, row_idx) = rand_sparse_ndarray(shape, 'row_sparse') - dns_out = mx.nd.cast_storage(rsp, stype='default') - dns_expected = np.zeros(shape, dtype=default_dtype()) - if row_idx is not None: - for k, v in enumerate(row_idx): - dns_expected[v, :] = data[k] - assert same(dns_out.asnumpy(), dns_expected) - - def test_dns_to_rsp(shape): - dns_in = rand_ndarray(shape, 'default') - rsp_out = mx.nd.cast_storage(mx.nd.array(dns_in, dtype=default_dtype()), stype='row_sparse') - ret = mx.nd.cast_storage(rsp_out, stype='default') - assert same(ret.asnumpy(), dns_in.asnumpy()) + def test_rsp_to_dns(shape, density): + rsp_in, (data, row_idx) = rand_sparse_ndarray(shape, 'row_sparse', density) + dns_out = mx.nd.cast_storage(rsp_in, stype='default') + assert same(rsp_in.asnumpy(), dns_out.asnumpy()) + + def test_dns_to_rsp(shape, density): + rsp_in, (data, row_idx) = rand_sparse_ndarray(shape, 'row_sparse', density) + rsp_out = mx.nd.cast_storage(mx.nd.array(rsp_in.todense(), dtype=default_dtype()), stype='row_sparse') + assert same(rsp_in.asnumpy(), rsp_out.asnumpy()) def test_csr_to_dns(shape, density): csr_in, (indptr, indices, values) = rand_sparse_ndarray(shape, 'csr', density) - dns_out = csr_in.todense() + dns_out = mx.nd.cast_storage(csr_in, stype='default') assert same(csr_in.asnumpy(), dns_out.asnumpy()) def test_dns_to_csr(shape, density): csr_in, (indptr, colidx, data) = rand_sparse_ndarray(shape, 'csr', density) - dns_in = csr_in.todense() - csr_out = mx.nd.cast_storage(mx.nd.array(dns_in, dtype=default_dtype()), stype='csr') + csr_out = mx.nd.cast_storage(mx.nd.array(csr_in.todense(), dtype=default_dtype()), stype='csr') assert same(csr_in.asnumpy(), csr_out.asnumpy()) shape = rand_shape_2d() @@ -102,10 +96,27 @@ def test_dns_to_csr(shape, density): density = [1.00, 0.50, 0.10, 0.05, 0.01] for d in density: - test_csr_to_dns((rnd.randint(1, 10), rnd.randint( 1, 64)), d) - test_dns_to_csr((rnd.randint(1, 10), rnd.randint( 1, 31)), d) # test gpu thread kernel - test_dns_to_csr((rnd.randint(1, 10), rnd.randint( 32, 512)), d) # test gpu warp kernel - test_dns_to_csr((rnd.randint(1, 10), rnd.randint(513, 1024)), d) # test gpu block kernel + shape_2d = rand_shape_2d() + shape_3d = rand_shape_3d() + test_csr_to_dns(shape_2d, d) + test_dns_to_csr(shape_2d, d) + test_rsp_to_dns(shape_2d, d) + test_dns_to_rsp(shape_2d, d) + test_rsp_to_dns(shape_3d, d) + test_dns_to_rsp(shape_3d, d) + for i in range(4, 6): + shape = rand_shape_nd(i, 5) + test_dns_to_rsp(shape, d) + test_rsp_to_dns(shape, d) + # Test specific gpu kernels + if default_context().device_type is 'gpu': + test_dns_to_csr((rnd.randint(1, 10), rnd.randint( 1, 32)), d) # test gpu thread kernel + test_dns_to_csr((rnd.randint(1, 10), rnd.randint( 32, 512)), d) # test gpu warp kernel + test_dns_to_csr((rnd.randint(1, 10), rnd.randint(512, 1024)), d) # test gpu block kernel + test_dns_to_rsp((rnd.randint(1, 10), rnd.randint( 1, 32)), d) # test gpu thread kernel + test_dns_to_rsp((rnd.randint(1, 10), rnd.randint( 32, 512)), d) # test gpu warp kernel + test_dns_to_rsp((rnd.randint(1, 10), rnd.randint(512, 1024)), d) # test gpu block kernel + def test_sparse_dot(): From 0f3436c8963acc632b7d92e1a14eeacdefdfbb5a Mon Sep 17 00:00:00 2001 From: Henneking Date: Thu, 27 Jul 2017 11:39:43 -0700 Subject: [PATCH 06/14] added dns_to_rsp to cast storage benchmark script --- benchmark/python/cast_storage.py | 39 ++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 14 deletions(-) 
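Note (placed below the diffstat, so it is not applied by `git am`): a minimal sketch of the cast this benchmark times, assuming a CUDA-enabled build and the same `mxnet.test_utils` helpers the script already relies on (`rand_ndarray`, `same`); the shape and density below are illustrative, not the script's defaults.

    import mxnet as mx
    from mxnet.test_utils import rand_ndarray, same

    stype = 'row_sparse'                                   # or 'csr'
    # generate sparse data, then densify it to get the dense input to cast
    dns_data = rand_ndarray((512, 50000), stype, 0.05).todense()
    out = mx.nd.cast_storage(dns_data, stype=stype)        # the call measure_cost() times
    assert same(out.asnumpy(), dns_data.asnumpy())         # same warm-up check as dense_to_sparse()
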
diff --git a/benchmark/python/cast_storage.py b/benchmark/python/cast_storage.py index a92da7295d2b..c34cd10eb3a2 100644 --- a/benchmark/python/cast_storage.py +++ b/benchmark/python/cast_storage.py @@ -23,17 +23,17 @@ def measure_cost(repeat, f, *args, **kwargs): def run_cast_storage_synthetic(): - def dns_to_csr(m, n, density, ctx, repeat): + def dense_to_sparse(m, n, density, ctx, repeat, storage_type): set_default_context(ctx) data_shape = (m, n) - dns_data = rand_ndarray(data_shape, 'csr', density).todense() + dns_data = rand_ndarray(data_shape, storage_type, density).todense() dns_data.wait_to_read() # do one warm up run, verify correctness - assert same(mx.nd.cast_storage(dns_data, stype='csr').asnumpy(), dns_data.asnumpy()) + assert same(mx.nd.cast_storage(dns_data, stype=storage_type).asnumpy(), dns_data.asnumpy()) # start benchmarking - cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype='csr') + cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype=storage_type) results = '{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(density*100, str(ctx), m, n, cost*1000) print(results) @@ -46,24 +46,35 @@ def dns_to_csr(m, n, density, ctx, repeat): # num_repeat number of benchmark runs to average over # contexts mx.cpu(), mx.gpu() # note: benchmark different contexts separately; to benchmark cpu, compile without CUDA + # benchmarks dns_to_csr, dns_to_rsp m = [ 512, 512] n = [50000, 100000] density = [1.00, 0.80, 0.60, 0.40, 0.20, 0.10, 0.05, 0.02, 0.01] num_repeat = 10 contexts = [mx.gpu()] + benchmarks = ["dns_to_csr", "dns_to_rsp"] # run benchmark - print("==================================================") - print(" cast_storage benchmark: dense to csr, size m x n ") - print("==================================================") - headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)') - print(headline) - for i in range(len(n)): - for ctx in contexts: - for den in density: - dns_to_csr(m[i], n[i], den, ctx, num_repeat) + for b in benchmarks: + stype = '' + print("==================================================") + if b is "dns_to_csr": + stype = 'csr' + print(" cast_storage benchmark: dense to csr, size m x n ") + elif b is "dns_to_rsp": + stype = 'row_sparse' + print(" cast_storage benchmark: dense to rsp, size m x n ") + else: + print("invalid benchmark: b, continue") + print("==================================================") + headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)') + print(headline) + for i in range(len(n)): + for ctx in contexts: + for den in density: + dense_to_sparse(m[i], n[i], den, ctx, num_repeat, stype) + print("") print("") - print("==================================================") if __name__ == "__main__": From 32555552134b554d4cc2148ca453891df432be6b Mon Sep 17 00:00:00 2001 From: Henneking Date: Thu, 27 Jul 2017 11:54:41 -0700 Subject: [PATCH 07/14] removing redundant unit test --- tests/python/unittest/test_sparse_operator.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py index c5a31dca2ffd..2d073ac4e7a5 100644 --- a/tests/python/unittest/test_sparse_operator.py +++ b/tests/python/unittest/test_sparse_operator.py @@ -89,11 +89,6 @@ def test_dns_to_csr(shape, density): csr_out = mx.nd.cast_storage(mx.nd.array(csr_in.todense(), dtype=default_dtype()), stype='csr') assert same(csr_in.asnumpy(), csr_out.asnumpy()) - shape = rand_shape_2d() - if 
default_context().device_type is 'cpu': - test_rsp_to_dns(shape) - test_dns_to_rsp(shape) - density = [1.00, 0.50, 0.10, 0.05, 0.01] for d in density: shape_2d = rand_shape_2d() From 824aedf4eb6ccc4bf2ea41e376e4e939132788b7 Mon Sep 17 00:00:00 2001 From: Henneking Date: Thu, 27 Jul 2017 13:17:39 -0700 Subject: [PATCH 08/14] fix lint --- src/operator/tensor/cast_storage-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h index fb9caf9be77c..2ad1957a4648 100644 --- a/src/operator/tensor/cast_storage-inl.h +++ b/src/operator/tensor/cast_storage-inl.h @@ -88,8 +88,8 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, // TODO(haibin) Use memcopy instead will be much faster than assigning each individual element struct CastStorageRspDnsKernel { template - MSHADOW_XINLINE static void Map(int i, const index_t row_length, const IType* idx, const DType *data, - DType* dns) { + MSHADOW_XINLINE static void Map(int i, const index_t row_length, const IType* idx, + const DType *data, DType* dns) { auto rid = idx[i]; auto dns_offset = rid * row_length; auto rsp_offset = i * row_length; From fb2abc83017693ad3a8dabea5a8c43e1889080f8 Mon Sep 17 00:00:00 2001 From: Henneking Date: Thu, 27 Jul 2017 13:32:20 -0700 Subject: [PATCH 09/14] minor change in benchmark script --- benchmark/python/cast_storage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmark/python/cast_storage.py b/benchmark/python/cast_storage.py index c34cd10eb3a2..51356217dadf 100644 --- a/benchmark/python/cast_storage.py +++ b/benchmark/python/cast_storage.py @@ -65,7 +65,8 @@ def dense_to_sparse(m, n, density, ctx, repeat, storage_type): stype = 'row_sparse' print(" cast_storage benchmark: dense to rsp, size m x n ") else: - print("invalid benchmark: b, continue") + print("invalid benchmark: %s" %b) + continue print("==================================================") headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)') print(headline) From b20136deb43128dc3ffc4e52f190e4969f8d8836 Mon Sep 17 00:00:00 2001 From: Henneking Date: Thu, 27 Jul 2017 14:05:11 -0700 Subject: [PATCH 10/14] fix lint --- python/mxnet/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 306dea04faf0..ca1e8415f7ee 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -116,7 +116,7 @@ def rand_shape_3d(dim0=10, dim1=10, dim2=10): def rand_shape_nd(n, dim=10): - return rnd.randint(1, dim+1, size=n) + return rnd.randint(1, dim+1, size=n) def np_reduce(dat, axis, keepdims, numpy_reduce_func): From 581ce64c9faae2c76aa85625359062d89e0c3f3c Mon Sep 17 00:00:00 2001 From: Henneking Date: Fri, 28 Jul 2017 11:26:23 -0700 Subject: [PATCH 11/14] correct function description --- src/operator/tensor/cast_storage-inl.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/tensor/cast_storage-inl.cuh b/src/operator/tensor/cast_storage-inl.cuh index c1043e4e024c..448cbd3edc50 100644 --- a/src/operator/tensor/cast_storage-inl.cuh +++ b/src/operator/tensor/cast_storage-inl.cuh @@ -54,7 +54,7 @@ struct MarkRspRowIdxThreadKernel { }; /*! - * \brief Thread kernel for marking non-zero rows of a tensor. + * \brief Warp kernel for marking non-zero rows of a tensor. 
* Parallelized by tensor rows: 1 warp/row */ struct MarkRspRowIdxWarpKernel { @@ -95,7 +95,7 @@ struct MarkRspRowIdxWarpKernel { }; /*! - * \brief Thread kernel for marking non-zero rows of a tensor. + * \brief Block kernel for marking non-zero rows of a tensor. * Parallelized by tensor rows: 1 threadBlock/row */ struct MarkRspRowIdxBlockKernel { From 1b46f2183d614c166f1849f7690894ed0943bf65 Mon Sep 17 00:00:00 2001 From: Henneking Date: Fri, 28 Jul 2017 16:33:56 -0700 Subject: [PATCH 12/14] change storage_type to stype --- benchmark/python/cast_storage.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark/python/cast_storage.py b/benchmark/python/cast_storage.py index 51356217dadf..38398e5e164a 100644 --- a/benchmark/python/cast_storage.py +++ b/benchmark/python/cast_storage.py @@ -23,17 +23,17 @@ def measure_cost(repeat, f, *args, **kwargs): def run_cast_storage_synthetic(): - def dense_to_sparse(m, n, density, ctx, repeat, storage_type): + def dense_to_sparse(m, n, density, ctx, repeat, stype): set_default_context(ctx) data_shape = (m, n) - dns_data = rand_ndarray(data_shape, storage_type, density).todense() + dns_data = rand_ndarray(data_shape, stype, density).todense() dns_data.wait_to_read() # do one warm up run, verify correctness - assert same(mx.nd.cast_storage(dns_data, stype=storage_type).asnumpy(), dns_data.asnumpy()) + assert same(mx.nd.cast_storage(dns_data, stype).asnumpy(), dns_data.asnumpy()) # start benchmarking - cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype=storage_type) + cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype) results = '{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(density*100, str(ctx), m, n, cost*1000) print(results) From 6cdf419de3efc4e8ae2e08662424c9ad14d3810f Mon Sep 17 00:00:00 2001 From: Henneking Date: Fri, 28 Jul 2017 17:02:02 -0700 Subject: [PATCH 13/14] changed scope of using namespaces --- src/operator/tensor/cast_storage-inl.cuh | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/operator/tensor/cast_storage-inl.cuh b/src/operator/tensor/cast_storage-inl.cuh index 448cbd3edc50..54ebe608c6ce 100644 --- a/src/operator/tensor/cast_storage-inl.cuh +++ b/src/operator/tensor/cast_storage-inl.cuh @@ -13,9 +13,6 @@ namespace mxnet { namespace op { -using mshadow::cuda::kBaseThreadNum; -using mshadow::Shape1; -using mxnet_op::Kernel; /*! * \brief Thread kernel for marking non-zero rows of a tensor. 
@@ -65,7 +62,7 @@ struct MarkRspRowIdxWarpKernel { const index_t num_rows, const index_t row_length) { typedef cub::WarpReduce WarpReduce; - const index_t warps_per_block = kBaseThreadNum / 32; + const index_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; __shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block]; const index_t warp_id = tid / 32; // global warp id @@ -105,6 +102,7 @@ struct MarkRspRowIdxBlockKernel { const DType* dns, const index_t num_rows, const index_t row_length) { + using mshadow::cuda::kBaseThreadNum; typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; if (blockIdx.x < num_rows) { @@ -195,13 +193,15 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, CHECK(rsp != nullptr); CHECK_EQ(rsp->storage_type(), kRowSparseStorage); CHECK_EQ(dns.shape_, rsp->shape()); + using mshadow::Shape1; + using mxnet_op::Kernel; mshadow::Stream* s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(rsp->aux_type(rowsparse::kIdx), RType, { // row idx type const index_t num_rows = dns.shape_[0]; const index_t row_length = dns.shape_.ProdShape(1, dns.shape_.ndim()); const index_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; - const index_t threads_per_block = kBaseThreadNum; + const index_t threads_per_block = mshadow::cuda::kBaseThreadNum; const index_t min_num_warps = 512; index_t num_threads; // TODO: remove kernel dependency on warpSize=32 @@ -371,7 +371,7 @@ struct FillCsrIndPtrWarpKernel { IType* indptr, const DType* dns, const index_t num_rows, const index_t num_cols) { typedef cub::WarpReduce WarpReduce; - const index_t warps_per_block = kBaseThreadNum / 32; + const index_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; __shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block]; if (tid == 0) { @@ -407,7 +407,7 @@ struct FillCsrColIdxAndValsWarpKernel { const IType* indptr, const DType* dns, const index_t num_rows, const index_t num_cols) { typedef cub::WarpScan WarpScan; - const index_t warps_per_block = kBaseThreadNum / 32; + const index_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; __shared__ typename WarpScan::TempStorage temp_storage[warps_per_block]; __shared__ volatile index_t warp_nnz[warps_per_block]; @@ -455,6 +455,7 @@ struct FillCsrIndPtrBlockKernel { __device__ __forceinline__ static void Map(int tid, IType* indptr, const DType* dns, const index_t num_rows, const index_t num_cols) { + using mshadow::cuda::kBaseThreadNum; typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -487,6 +488,7 @@ struct FillCsrColIdxAndValsBlockKernel { DType* val, CType* col_idx, const IType* indptr, const DType* dns, const index_t num_rows, const index_t num_cols) { + using mshadow::cuda::kBaseThreadNum; typedef cub::BlockScan BlockScan; __shared__ typename BlockScan::TempStorage temp_storage; __shared__ volatile index_t block_nnz; @@ -534,6 +536,8 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx, CHECK_EQ(csr->storage_type(), kCSRStorage); CHECK_EQ(dns.shape_.ndim(), 2); CHECK_EQ(dns.shape_, csr->shape()); + using mshadow::Shape1; + using mxnet_op::Kernel; mshadow::Stream* s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIndPtr), IType, { // indptr type @@ -541,7 +545,7 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx, const index_t num_rows = dns.shape_[0]; const index_t num_cols = 
dns.shape_[1]; const index_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; - const index_t threads_per_block = kBaseThreadNum; + const index_t threads_per_block = mshadow::cuda::kBaseThreadNum; const index_t min_num_warps = 512; index_t num_threads; // TODO: remove kernel dependency on warpSize=32 From 3bb50b41587de3d42bbe33b28c2df40c2a18eecb Mon Sep 17 00:00:00 2001 From: Henneking Date: Mon, 31 Jul 2017 12:24:32 -0700 Subject: [PATCH 14/14] changed variable types from index_t to dim_t --- src/operator/tensor/cast_storage-inl.cuh | 218 +++++++++++++---------- 1 file changed, 123 insertions(+), 95 deletions(-) diff --git a/src/operator/tensor/cast_storage-inl.cuh b/src/operator/tensor/cast_storage-inl.cuh index 54ebe608c6ce..dfb09d632ac8 100644 --- a/src/operator/tensor/cast_storage-inl.cuh +++ b/src/operator/tensor/cast_storage-inl.cuh @@ -8,6 +8,7 @@ #include #include +#include #include @@ -31,11 +32,12 @@ struct MarkRspRowIdxThreadKernel { __device__ __forceinline__ static void Map(int tid, RType* row_flg, const DType* dns, - const index_t num_rows, - const index_t row_length) { + const nnvm::dim_t num_rows, + const nnvm::dim_t row_length) { + using nnvm::dim_t; if (tid < num_rows) { - index_t j = 0; - index_t offset = tid * row_length; + dim_t j = 0; + dim_t offset = tid * row_length; for (; j < row_length; ++j) { if (dns[offset+j] != 0) { break; @@ -59,27 +61,28 @@ struct MarkRspRowIdxWarpKernel { __device__ __forceinline__ static void Map(int tid, RType* row_flg, const DType* dns, - const index_t num_rows, - const index_t row_length) { - typedef cub::WarpReduce WarpReduce; - const index_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; + const nnvm::dim_t num_rows, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + typedef cub::WarpReduce WarpReduce; + const dim_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; __shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block]; - const index_t warp_id = tid / 32; // global warp id - const index_t warp_lane = threadIdx.x / 32; // local warp id within thread block - const index_t lane = tid & (32-1); // local thread id within warp + const dim_t warp_id = tid / 32; // global warp id + const dim_t warp_lane = threadIdx.x / 32; // local warp id within thread block + const dim_t lane = tid & (32-1); // local thread id within warp if (warp_id < num_rows) { - index_t flg = 0; - index_t offset = warp_id * row_length; - for (index_t j = lane; j < row_length; j+=32) { + dim_t flg = 0; + dim_t offset = warp_id * row_length; + for (dim_t j = lane; j < row_length; j+=32) { if (dns[offset+j] != 0) { // avoid break: causes slower performance on sparse tensors (<20% density), // due to thread divergence flg++; } } - index_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(flg); + dim_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(flg); if (lane == 0) { if (aggr > 0) { row_flg[warp_id] = 1; // mark as one for non-zero row @@ -100,22 +103,23 @@ struct MarkRspRowIdxBlockKernel { __device__ __forceinline__ static void Map(int tid, RType* row_flg, const DType* dns, - const index_t num_rows, - const index_t row_length) { + const nnvm::dim_t num_rows, + const nnvm::dim_t row_length) { + using nnvm::dim_t; using mshadow::cuda::kBaseThreadNum; - typedef cub::BlockReduce BlockReduce; + typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; if (blockIdx.x < num_rows) { - index_t flg = 0; - index_t offset = blockIdx.x * row_length; - for (index_t j = threadIdx.x; j < row_length; 
j+=kBaseThreadNum) { + dim_t flg = 0; + dim_t offset = blockIdx.x * row_length; + for (dim_t j = threadIdx.x; j < row_length; j+=kBaseThreadNum) { if (dns[offset+j] != 0) { // avoid break: causes slower performance on sparse tensors (<20% density), // due to thread divergence flg++; } } - index_t aggr = BlockReduce(temp_storage).Sum(flg); + dim_t aggr = BlockReduce(temp_storage).Sum(flg); if (threadIdx.x == 0) { if (aggr > 0) { row_flg[blockIdx.x] = 1; // mark as one for non-zero row @@ -143,9 +147,9 @@ struct FillRspRowIdxKernel { __device__ __forceinline__ static void Map(int tid, RType* row_idx, const RType* row_flg_sum, - const index_t num_rows) { + const nnvm::dim_t num_rows) { if (tid < num_rows) { - index_t prev = (tid == 0)? 0 : row_flg_sum[tid-1]; + nnvm::dim_t prev = (tid == 0)? 0 : row_flg_sum[tid-1]; if (row_flg_sum[tid] > prev) { row_idx[prev] = tid; } @@ -172,12 +176,13 @@ struct FillRspValsKernel { DType* rsp_val, const RType* row_idx, const DType* dns, - const index_t nnr, - const index_t row_length) { + const nnvm::dim_t nnr, + const nnvm::dim_t row_length) { + using nnvm::dim_t; if (tid < nnr*row_length) { - const index_t row_id = tid / row_length; - const index_t row_el = tid % row_length; - const index_t dns_idx = row_idx[row_id] * row_length + row_el; + const dim_t row_id = tid / row_length; + const dim_t row_el = tid % row_length; + const dim_t dns_idx = row_idx[row_id] * row_length + row_el; rsp_val[tid] = dns[dns_idx]; } } @@ -195,15 +200,16 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, CHECK_EQ(dns.shape_, rsp->shape()); using mshadow::Shape1; using mxnet_op::Kernel; + using nnvm::dim_t; mshadow::Stream* s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(rsp->aux_type(rowsparse::kIdx), RType, { // row idx type - const index_t num_rows = dns.shape_[0]; - const index_t row_length = dns.shape_.ProdShape(1, dns.shape_.ndim()); - const index_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; - const index_t threads_per_block = mshadow::cuda::kBaseThreadNum; - const index_t min_num_warps = 512; - index_t num_threads; + const dim_t num_rows = dns.shape_[0]; + const dim_t row_length = dns.shape_.ProdShape(1, dns.shape_.ndim()); + const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + const dim_t threads_per_block = mshadow::cuda::kBaseThreadNum; + const dim_t min_num_warps = 512; + dim_t num_threads; // TODO: remove kernel dependency on warpSize=32 if (threads_per_warp != 32) { LOG(FATAL) << "CastStorageDnsRspImpl GPU kernels expect warpSize=32"; @@ -276,7 +282,7 @@ inline void CastStorageDnsRspImpl(const OpContext& ctx, CUDA_CALL(cudaMemcpy(&nnr, &row_flg[num_rows-1], sizeof(RType), cudaMemcpyDeviceToHost)); // Allocate rsp tensor row index array and fill - rsp->CheckAndAllocAuxData(rowsparse::kIdx, Shape1(static_cast(nnr))); + rsp->CheckAndAllocAuxData(rowsparse::kIdx, Shape1(static_cast(nnr))); if (0 == nnr) return; RType* row_idx = rsp->aux_data(rowsparse::kIdx).dptr(); num_threads = num_rows; @@ -309,15 +315,18 @@ struct FillCsrIndPtrThreadKernel { */ template __device__ __forceinline__ static void Map(int tid, - IType* indptr, const DType* dns, - const index_t num_rows, const index_t num_cols) { + IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; if (tid == 0) { indptr[tid] = 0; } if (tid < num_rows) { - index_t nnz = 0; - const index_t offset = tid * num_cols; - for (index_t j = 0; j < num_cols; ++j) 
{ + dim_t nnz = 0; + const dim_t offset = tid * num_cols; + for (dim_t j = 0; j < num_cols; ++j) { if (dns[offset+j] != 0) { nnz++; } @@ -344,13 +353,17 @@ struct FillCsrColIdxAndValsThreadKernel { */ template __device__ __forceinline__ static void Map(int tid, - DType* val, CType* col_idx, - const IType* indptr, const DType* dns, - const index_t num_rows, const index_t num_cols) { + DType* val, + CType* col_idx, + const IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; if (tid < num_rows) { - const index_t offset = tid * num_cols; - index_t k = indptr[tid]; - for (index_t j = 0; j < num_cols; ++j) { + const dim_t offset = tid * num_cols; + dim_t k = indptr[tid]; + for (dim_t j = 0; j < num_cols; ++j) { if (dns[offset+j] != 0) { val[k] = dns[offset+j]; col_idx[k] = j; @@ -368,27 +381,30 @@ struct FillCsrColIdxAndValsThreadKernel { struct FillCsrIndPtrWarpKernel { template __device__ __forceinline__ static void Map(int tid, - IType* indptr, const DType* dns, - const index_t num_rows, const index_t num_cols) { - typedef cub::WarpReduce WarpReduce; - const index_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; + IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + typedef cub::WarpReduce WarpReduce; + const dim_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; __shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block]; if (tid == 0) { indptr[tid] = 0; } - const index_t warp_id = tid / 32; // global warp id - const index_t warp_lane = threadIdx.x / 32; // local warp id within thread block - const index_t lane = tid & (32-1); // local thread id within warp + const dim_t warp_id = tid / 32; // global warp id + const dim_t warp_lane = threadIdx.x / 32; // local warp id within thread block + const dim_t lane = tid & (32-1); // local thread id within warp if (warp_id < num_rows) { - index_t lane_nnz = 0; - const index_t offset = warp_id * num_cols; - for (index_t j = lane; j < num_cols; j+=32) { + dim_t lane_nnz = 0; + const dim_t offset = warp_id * num_cols; + for (dim_t j = lane; j < num_cols; j+=32) { if (dns[offset+j] != 0) { lane_nnz++; } } - index_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(lane_nnz); + dim_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(lane_nnz); if (lane == 0) { indptr[warp_id+1] = aggr; } @@ -403,22 +419,26 @@ struct FillCsrIndPtrWarpKernel { struct FillCsrColIdxAndValsWarpKernel { template __device__ __forceinline__ static void Map(int tid, - DType* val, CType* col_idx, - const IType* indptr, const DType* dns, - const index_t num_rows, const index_t num_cols) { - typedef cub::WarpScan WarpScan; - const index_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; + DType* val, + CType* col_idx, + const IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + typedef cub::WarpScan WarpScan; + const dim_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; __shared__ typename WarpScan::TempStorage temp_storage[warps_per_block]; - __shared__ volatile index_t warp_nnz[warps_per_block]; + __shared__ volatile dim_t warp_nnz[warps_per_block]; - const index_t warp_id = tid / 32; // global warp id - const index_t warp_lane = threadIdx.x / 32; // local warp id within thread block - const index_t lane = tid & (32-1); // local thread id within warp + const dim_t warp_id = tid / 32; // global warp id + const dim_t warp_lane = threadIdx.x / 32; // local 
+    const dim_t lane = tid & (32-1);          // local thread id within warp
     if (warp_id < num_rows) {
-      const index_t offset = warp_id * num_cols;
-      index_t k = indptr[warp_id];
-      index_t nnz;
-      for (index_t j = lane; j < num_cols+lane; j+=32) {
+      const dim_t offset = warp_id * num_cols;
+      dim_t k = indptr[warp_id];
+      dim_t nnz;
+      for (dim_t j = lane; j < num_cols+lane; j+=32) {
         nnz = 0;
         if (j < num_cols) {
           if (dns[offset+j] != 0) {
@@ -453,24 +473,27 @@ struct FillCsrColIdxAndValsWarpKernel {
 struct FillCsrIndPtrBlockKernel {
   template<typename DType, typename IType>
   __device__ __forceinline__ static void Map(int tid,
-                                             IType* indptr, const DType* dns,
-                                             const index_t num_rows, const index_t num_cols) {
+                                             IType* indptr,
+                                             const DType* dns,
+                                             const nnvm::dim_t num_rows,
+                                             const nnvm::dim_t num_cols) {
     using mshadow::cuda::kBaseThreadNum;
-    typedef cub::BlockReduce<index_t, kBaseThreadNum> BlockReduce;
+    using nnvm::dim_t;
+    typedef cub::BlockReduce<dim_t, kBaseThreadNum> BlockReduce;
     __shared__ typename BlockReduce::TempStorage temp_storage;

     if (tid == 0) {
       indptr[tid] = 0;
     }
     if (blockIdx.x < num_rows) {
-      index_t lane_nnz = 0;
-      const index_t offset = blockIdx.x * num_cols;
-      for (index_t j = threadIdx.x; j < num_cols; j+=kBaseThreadNum) {
+      dim_t lane_nnz = 0;
+      const dim_t offset = blockIdx.x * num_cols;
+      for (dim_t j = threadIdx.x; j < num_cols; j+=kBaseThreadNum) {
         if (dns[offset+j] != 0) {
           lane_nnz++;
         }
       }
-      index_t aggr = BlockReduce(temp_storage).Sum(lane_nnz);
+      dim_t aggr = BlockReduce(temp_storage).Sum(lane_nnz);
       if (threadIdx.x == 0) {
         indptr[blockIdx.x+1] = aggr;
       }
@@ -485,19 +508,23 @@ struct FillCsrIndPtrBlockKernel {
 struct FillCsrColIdxAndValsBlockKernel {
   template<typename DType, typename IType, typename CType>
   __device__ __forceinline__ static void Map(int tid,
-                                             DType* val, CType* col_idx,
-                                             const IType* indptr, const DType* dns,
-                                             const index_t num_rows, const index_t num_cols) {
+                                             DType* val,
+                                             CType* col_idx,
+                                             const IType* indptr,
+                                             const DType* dns,
+                                             const nnvm::dim_t num_rows,
+                                             const nnvm::dim_t num_cols) {
     using mshadow::cuda::kBaseThreadNum;
-    typedef cub::BlockScan<index_t, kBaseThreadNum> BlockScan;
+    using nnvm::dim_t;
+    typedef cub::BlockScan<dim_t, kBaseThreadNum> BlockScan;
     __shared__ typename BlockScan::TempStorage temp_storage;
-    __shared__ volatile index_t block_nnz;
+    __shared__ volatile dim_t block_nnz;

     if (blockIdx.x < num_rows) {
-      const index_t offset = blockIdx.x * num_cols;
-      index_t k = indptr[blockIdx.x];
-      index_t nnz;
-      for (index_t j = threadIdx.x; j < num_cols+threadIdx.x; j+=kBaseThreadNum) {
+      const dim_t offset = blockIdx.x * num_cols;
+      dim_t k = indptr[blockIdx.x];
+      dim_t nnz;
+      for (dim_t j = threadIdx.x; j < num_cols+threadIdx.x; j+=kBaseThreadNum) {
         nnz = 0;
         if (j < num_cols) {
           if (dns[offset+j] != 0) {
@@ -538,16 +565,17 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx,
   CHECK_EQ(dns.shape_, csr->shape());
   using mshadow::Shape1;
   using mxnet_op::Kernel;
+  using nnvm::dim_t;
   mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
   MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, {  // data type
     MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIndPtr), IType, {  // indptr type
       MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIdx), CType, {  // col_idx type
-        const index_t num_rows = dns.shape_[0];
-        const index_t num_cols = dns.shape_[1];
-        const index_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize;
-        const index_t threads_per_block = mshadow::cuda::kBaseThreadNum;
-        const index_t min_num_warps = 512;
-        index_t num_threads;
+        const dim_t num_rows = dns.shape_[0];
+        const dim_t num_cols = dns.shape_[1];
+        const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize;
+        const dim_t threads_per_block = mshadow::cuda::kBaseThreadNum;
+        const dim_t min_num_warps = 512;
+        dim_t num_threads;
         // TODO: remove kernel dependency on warpSize=32
         if (threads_per_warp != 32) {
           LOG(FATAL) << "CastStorageDnsCsrImpl GPU kernels expect warpSize=32";
@@ -622,8 +650,8 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx,
         CUDA_CALL(cudaMemcpy(&nnz, &(indptr[num_rows]), sizeof(IType), cudaMemcpyDeviceToHost));

         // Allocate column index array and data array of the csr matrix
-        csr->CheckAndAllocAuxData(csr::kIdx, Shape1(static_cast<index_t>(nnz)));
-        csr->CheckAndAllocData(Shape1(static_cast<index_t>(nnz)));
+        csr->CheckAndAllocAuxData(csr::kIdx, Shape1(static_cast<dim_t>(nnz)));
+        csr->CheckAndAllocData(Shape1(static_cast<dim_t>(nnz)));

         // Compute and fill column index array and data array of the csr matrix
         switch (kernel_version) {
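
The hunks above define thread-per-row, warp-per-row, and block-per-row kernel flavors and leave the choice to the kernel_version switch that closes this hunk. The sketch below is illustrative only: the helper name pick_kernel_version and the exact thresholds are assumptions, since the selection logic itself is not part of the excerpt shown here; it merely mirrors the dispatch pattern suggested by threads_per_warp, threads_per_block, and min_num_warps.

// Hypothetical sketch, not part of the patch: choose a kernel flavor from the
// row width, mirroring the thread/warp/block variants defined above.
#include <cstdint>

typedef int64_t dim_t;  // stand-in for nnvm::dim_t

// Returns 1 (thread/row), 2 (warp/row) or 3 (block/row). Thresholds are assumed;
// the patch also defines min_num_warps, presumably so that small inputs still
// launch enough warps to keep the device busy.
inline int pick_kernel_version(const dim_t num_cols,
                               const dim_t threads_per_warp,     // e.g. 32
                               const dim_t threads_per_block) {  // e.g. kBaseThreadNum
  if (num_cols < threads_per_warp) {
    return 1;  // narrow rows: one thread scans an entire row
  } else if (num_cols < threads_per_block) {
    return 2;  // medium rows: one warp cooperates on a row
  } else {
    return 3;  // wide rows: a full thread block reduces one row
  }
}

Under these assumed thresholds, a matrix with a few thousand columns would take the block-per-row path, while a matrix with only a handful of columns would use one thread per row.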