
Commit

switch over 1-10 num_spatial_axes in {im2col,col2im}_gpu kernels
jeffdonahue committed Mar 6, 2015
1 parent d639086 commit 5b75694
Showing 1 changed file with 135 additions and 11 deletions.
146 changes: 135 additions & 11 deletions src/caffe/util/im2col.cu
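
For context, the change replaces a hard-coded im2col_gpu_kernel<Dtype, 2> launch with a switch over the runtime num_spatial_axes, so that the kernel's template parameter (which sizes its per-thread index arrays) stays a compile-time constant. A minimal, illustrative sketch of that dispatch pattern; kernel_impl and dispatch are made-up names standing in for the Caffe kernels, not actual Caffe code:

#include <cstdio>

template <int num_axes>
void kernel_impl() {
  // In the real kernels, num_axes sizes per-thread index arrays
  // (e.g. int d_im[num_axes]), so it must be a compile-time constant.
  std::printf("running specialization for %d spatial axes\n", num_axes);
}

void dispatch(int num_spatial_axes) {
  switch (num_spatial_axes) {
  case 1: kernel_impl<1>(); break;
  case 2: kernel_impl<2>(); break;
  case 3: kernel_impl<3>(); break;
  // ... the commit enumerates cases up to 10 ...
  default:
    std::fprintf(stderr, "unsupported: %d spatial axes\n", num_spatial_axes);
  }
}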
@@ -78,10 +78,71 @@ void im2col_gpu(const Dtype* data_im, const int num_spatial_axes,
const int num_kernels, const int* im_shape, const int* col_shape,
const int* kernel_shape, const int* pad, const int* stride,
Dtype* data_col) {
im2col_gpu_kernel<Dtype, 2> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_im, im_shape, col_shape,
kernel_shape, pad, stride, data_col);
switch (num_spatial_axes) {
case 1:
im2col_gpu_kernel<Dtype, 1> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_im, im_shape, col_shape,
kernel_shape, pad, stride, data_col);
break;
case 2:
im2col_gpu_kernel<Dtype, 2> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_im, im_shape, col_shape,
kernel_shape, pad, stride, data_col);
break;
case 3:
im2col_gpu_kernel<Dtype, 3> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_im, im_shape, col_shape,
kernel_shape, pad, stride, data_col);
break;
case 4:
im2col_gpu_kernel<Dtype, 4> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_im, im_shape, col_shape,
kernel_shape, pad, stride, data_col);
break;
case 5:
im2col_gpu_kernel<Dtype, 5> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_im, im_shape, col_shape,
kernel_shape, pad, stride, data_col);
break;
case 6:
im2col_gpu_kernel<Dtype, 6> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_im, im_shape, col_shape,
kernel_shape, pad, stride, data_col);
break;
case 7:
im2col_gpu_kernel<Dtype, 7> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_im, im_shape, col_shape,
kernel_shape, pad, stride, data_col);
break;
case 8:
im2col_gpu_kernel<Dtype, 8> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_im, im_shape, col_shape,
kernel_shape, pad, stride, data_col);
break;
case 9:
im2col_gpu_kernel<Dtype, 9> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_im, im_shape, col_shape,
kernel_shape, pad, stride, data_col);
break;
case 10:
im2col_gpu_kernel<Dtype, 10> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, data_im, im_shape, col_shape,
kernel_shape, pad, stride, data_col);
break;
default:
LOG(FATAL) << "im2col_gpu does not support computation with "
<< num_spatial_axes << " spatial axes";
}
CUDA_POST_KERNEL_CHECK;
}

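Every case above launches with the same grid configuration. For reference, the Caffe CUDA helpers it relies on are defined roughly along these lines (a sketch of the definitions in include/caffe/util/device_alternate.hpp; consult the header for the authoritative versions and thread count):

// Threads per block used by Caffe kernels (e.g. 512 in older versions).
const int CAFFE_CUDA_NUM_THREADS = 512;

// Number of blocks needed to cover N work items.
inline int CAFFE_GET_BLOCKS(const int N) {
  return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}

// Grid-stride loop: each thread handles index i, then i + gridDim.x * blockDim.x, ...
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)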
@@ -107,7 +168,6 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
int d_col_start[num_axes]; // NOLINT(runtime/arrays)
int d_col_end[num_axes]; // NOLINT(runtime/arrays)
CUDA_KERNEL_LOOP(index, n) {
Dtype val = 0;
// Initialize channel_in, computed in the loop below, with intermediate
// computations used to compute the spatial indices.
int channel_im = index;
@@ -117,14 +177,16 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
channel_im /= im_shape[i + 1];
}
// Calculate col start/end indices.
bool done = false; // Skip computation if any dims are out of range.
bool done = false;
for (int i = 0; i < num_axes; ++i) {
d_col_start[i] = d_col_iter[i] =
(d_im[i] < kernel_shape[i]) ?
0 : (d_im[i] - kernel_shape[i]) / stride[i] + 1;
d_col_end[i] = min(d_im[i] / stride[i] + 1, col_shape[i + 1]);
if (d_col_start[i] >= d_col_end[i]) {
data_im[index] = val;
// Skip computation if the dimension is 0 at any spatial axis --
// final val will be 0.
data_im[index] = 0;
done = true;
break; // for (int i = 0; i < num_axes; ++i)
}
@@ -133,6 +195,7 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
continue; // CUDA_KERNEL_LOOP(index, n)
}
// Loop over the col to compute the output val.
Dtype val = 0;
bool incremented = true;
do {
// Compute the final offset.
@@ -170,10 +233,71 @@ void col2im_gpu(const Dtype* data_col, const int num_spatial_axes,
const int im_size, const int* im_shape, const int* col_shape,
const int* kernel_shape, const int* pad, const int* stride,
Dtype* data_im) {
col2im_gpu_kernel<Dtype, 2> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
im_size, data_col, im_shape, col_shape,
kernel_shape, pad, stride, data_im);
switch (num_spatial_axes) {
case 1:
col2im_gpu_kernel<Dtype, 1> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
im_size, data_col, im_shape, col_shape,
kernel_shape, pad, stride, data_im);
break;
case 2:
col2im_gpu_kernel<Dtype, 2> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
im_size, data_col, im_shape, col_shape,
kernel_shape, pad, stride, data_im);
break;
case 3:
col2im_gpu_kernel<Dtype, 3> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
im_size, data_col, im_shape, col_shape,
kernel_shape, pad, stride, data_im);
break;
case 4:
col2im_gpu_kernel<Dtype, 4> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
im_size, data_col, im_shape, col_shape,
kernel_shape, pad, stride, data_im);
break;
case 5:
col2im_gpu_kernel<Dtype, 5> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
im_size, data_col, im_shape, col_shape,
kernel_shape, pad, stride, data_im);
break;
case 6:
col2im_gpu_kernel<Dtype, 6> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
im_size, data_col, im_shape, col_shape,
kernel_shape, pad, stride, data_im);
break;
case 7:
col2im_gpu_kernel<Dtype, 7> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
im_size, data_col, im_shape, col_shape,
kernel_shape, pad, stride, data_im);
break;
case 8:
col2im_gpu_kernel<Dtype, 8> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
im_size, data_col, im_shape, col_shape,
kernel_shape, pad, stride, data_im);
break;
case 9:
col2im_gpu_kernel<Dtype, 9> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
im_size, data_col, im_shape, col_shape,
kernel_shape, pad, stride, data_im);
break;
case 10:
col2im_gpu_kernel<Dtype, 10> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
im_size, data_col, im_shape, col_shape,
kernel_shape, pad, stride, data_im);
break;
default:
LOG(FATAL) << "im2col_gpu does not support computation with "
<< num_spatial_axes << " spatial axes";
}
CUDA_POST_KERNEL_CHECK;
}

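The d_col_start / d_col_end arithmetic in col2im_gpu_kernel determines, per spatial axis, which column (patch) positions cover a given padded image coordinate. A small host-side sketch of that same formula; col_range is an illustrative helper, not part of Caffe:

#include <algorithm>
#include <cstdio>

// d_im: padded image coordinate; kernel: kernel extent along this axis;
// stride: stride along this axis; col_extent: number of patch positions
// along this axis (col_shape[i + 1] in the kernel above).
void col_range(int d_im, int kernel, int stride, int col_extent) {
  const int start = (d_im < kernel) ? 0 : (d_im - kernel) / stride + 1;
  const int end = std::min(d_im / stride + 1, col_extent);  // exclusive
  std::printf("image coord %d is covered by patches [%d, %d)\n",
              d_im, start, end);
}

int main() {
  // With kernel 3, stride 1, and 5 patch positions, coordinate 4 is
  // covered by patches 2, 3, and 4.
  col_range(4, 3, 1, 5);
  return 0;
}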
