Replace atlas/cblas routines with Eigen in the math functions #85

Closed
wants to merge 19 commits

Changes from all commits
19 commits
1bcdfd4
compile caffe without MKL (dependency replaced by boost::random, Eigen3)
rodrigob Dec 8, 2013
8a1ede9
Fixed uniform distribution upper bound to be inclusive
kloudkl Jan 11, 2014
9293cc2
Fixed FlattenLayer Backward_cpu/gpu have no return value
kloudkl Jan 11, 2014
d8dd5d0
Fix test stochastic pooling stepsize/threshold to be same as max pooling
kloudkl Jan 11, 2014
00b450b
Fix math funcs, add tests, change Eigen Map to unaligned for lrn_layer
kloudkl Jan 12, 2014
958f038
Fix test_data_layer segfault by adding destructor to join pthread
kloudkl Jan 12, 2014
5385b74
relax precision of MultinomialLogisticLossLayer test
shelhamer Jan 9, 2014
8d894f0
Merge pull request #28 from kloudkl/boost-eigen
shelhamer Jan 22, 2014
d74c16d
nextafter templates off one type
Jan 22, 2014
7ac4a30
mean_bound and sample_mean need referencing with this
Jan 22, 2014
3122c8a
Merge pull request #47 from alito/compileerrorsboosteigenkloudkl
jeffdonahue Jan 22, 2014
3d2696e
make uniform distribution usage compatible with boost 1.46
jeffdonahue Jan 22, 2014
f76b296
use boost variate_generator to pass tests w/ boost 1.46 (Gaussian filler
jeffdonahue Jan 22, 2014
fae6944
change all Rng's to use variate_generator for consistency
jeffdonahue Jan 22, 2014
a5f2cb1
Merge pull request #49 from jeffdonahue/boosteigencompilewithboost146
shelhamer Jan 22, 2014
6639f8f
add bernoulli rng test to demonstrate bug (generates all 0s unless p ==
jeffdonahue Jan 29, 2014
d1c9111
fix bernoulli generator bug
jeffdonahue Jan 29, 2014
ca1c462
Merge pull request #63 from jeffdonahue/bernoullirngbugfix
shelhamer Jan 30, 2014
50f0491
Replace atlas/cblas routines with Eigen in the math functions
kloudkl Feb 8, 2014
11 changes: 6 additions & 5 deletions Makefile
@@ -69,17 +69,18 @@ MKL_LIB_DIR := $(MKL_DIR)/lib $(MKL_DIR)/lib/intel64

INCLUDE_DIRS += ./src ./include $(CUDA_INCLUDE_DIR) $(MKL_INCLUDE_DIR)
LIBRARY_DIRS += $(CUDA_LIB_DIR) $(MKL_LIB_DIR)
LIBRARIES := cudart cublas curand protobuf opencv_core opencv_highgui \
glog mkl_rt mkl_intel_thread leveldb snappy pthread boost_system \
opencv_imgproc
LIBRARIES := cudart cublas curand pthread gomp \
glog protobuf leveldb snappy boost_system \
opencv_core opencv_highgui opencv_imgproc
PYTHON_LIBRARIES := boost_python python2.7
WARNINGS := -Wall

COMMON_FLAGS := -DNDEBUG -O2 $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS)
CXXFLAGS += -pthread -fPIC -fopenmp $(COMMON_FLAGS)
NVCCFLAGS := -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \
$(foreach library,$(LIBRARIES),-l$(library))
TEST_LDFLAGS += -lopenblas
PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library))


@@ -132,7 +133,7 @@ runtest: test
for testbin in $(TEST_BINS); do $$testbin $(TEST_GPUID); done

$(TEST_BINS): %.testbin : %.o $(GTEST_OBJ) $(STATIC_NAME) $(TEST_HDRS)
$(CXX) $< $(GTEST_OBJ) $(STATIC_NAME) -o $@ $(LDFLAGS) $(WARNINGS)
$(CXX) $< $(GTEST_OBJ) $(STATIC_NAME) -o $@ $(LDFLAGS) $(TEST_LDFLAGS) $(WARNINGS)

$(EXAMPLE_BINS): %.bin : %.o $(STATIC_NAME)
$(CXX) $< $(STATIC_NAME) -o $@ $(LDFLAGS) $(WARNINGS)
8 changes: 8 additions & 0 deletions include/caffe/blob.hpp
@@ -27,6 +27,14 @@ class Blob {
inline int count() const {return count_; }
inline int offset(const int n, const int c = 0, const int h = 0,
const int w = 0) const {
CHECK_GE(n, 0);
CHECK_LE(n, num_);
CHECK_GE(channels_, 0);
CHECK_LE(c, channels_);
CHECK_GE(height_, 0);
CHECK_LE(h, height_);
CHECK_GE(width_, 0);
CHECK_LE(w, width_);
return ((n * channels_ + c) * height_ + h) * width_ + w;
}
// Copy from source. If copy_diff is false, we copy the data; if copy_diff
14 changes: 11 additions & 3 deletions include/caffe/common.hpp
@@ -3,14 +3,15 @@
#ifndef CAFFE_COMMON_HPP_
#define CAFFE_COMMON_HPP_

#include <boost/random/mersenne_twister.hpp>
#include <boost/shared_ptr.hpp>
#include <cublas_v2.h>
#include <cuda.h>
#include <curand.h>
// cuda driver types
#include <driver_types.h>
#include <glog/logging.h>
#include <mkl_vsl.h>
//#include <mkl_vsl.h>

// various checks for different function calls.
#define CUDA_CHECK(condition) CHECK_EQ((condition), cudaSuccess)
@@ -78,8 +79,13 @@ class Caffe {
inline static curandGenerator_t curand_generator() {
return Get().curand_generator_;
}

// Returns the MKL random stream.
inline static VSLStreamStatePtr vsl_stream() { return Get().vsl_stream_; }
//inline static VSLStreamStatePtr vsl_stream() { return Get().vsl_stream_; }

typedef boost::mt19937 random_generator_t;
inline static random_generator_t &vsl_stream() { return Get().random_generator_; }

// Returns the mode: running on CPU or GPU.
inline static Brew mode() { return Get().mode_; }
// Returns the phase: TRAIN or TEST.
@@ -103,7 +109,9 @@ class Caffe {
protected:
cublasHandle_t cublas_handle_;
curandGenerator_t curand_generator_;
VSLStreamStatePtr vsl_stream_;
//VSLStreamStatePtr vsl_stream_;
random_generator_t random_generator_;

Brew mode_;
Phase phase_;
static shared_ptr<Caffe> singleton_;
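Note (not part of the patch itself): with the VSL stream gone, callers draw samples from the shared boost::mt19937 exposed by Caffe::vsl_stream(), typically wrapped in a boost::variate_generator as the later commits in this PR do. A minimal sketch of a uniform fill in that style, assuming boost::uniform_real; the actual caffe_vRngUniform in this PR additionally makes the upper bound inclusive (see the caffe_nextafter declaration and the "upper bound to be inclusive" commit above), which this sketch omits.

#include <boost/random/uniform_real.hpp>
#include <boost/random/variate_generator.hpp>

// Illustrative only: fill r[0..n) with uniform samples in [a, b)
// drawn from the shared boost::mt19937 engine.
template <typename Dtype>
void uniform_fill_sketch(const int n, Dtype* r, const Dtype a, const Dtype b) {
  boost::uniform_real<Dtype> dist(a, b);
  boost::variate_generator<Caffe::random_generator_t&, boost::uniform_real<Dtype> >
      sampler(Caffe::vsl_stream(), dist);
  for (int i = 0; i < n; ++i) {
    r[i] = sampler();
  }
}
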
2 changes: 1 addition & 1 deletion include/caffe/filler.hpp
@@ -7,7 +7,7 @@
#ifndef CAFFE_FILLER_HPP
#define CAFFE_FILLER_HPP

#include <mkl.h>
//#include <mkl.h>
#include <string>

#include "caffe/common.hpp"
38 changes: 36 additions & 2 deletions include/caffe/util/math_functions.hpp
@@ -3,11 +3,39 @@
#ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_
#define CAFFE_UTIL_MATH_FUNCTIONS_H_

#include <mkl.h>
#include <cublas_v2.h>
//#include <mkl.h>
#include <eigen3/Eigen/Dense>

namespace caffe {

// Operations on aligned memory are faster than on unaligned memory.
// But unfortunately, the pointers passed in are not always aligned.
// Therefore, the memory-aligned Eigen::Map objects that wrap them
// cannot be assigned to. This happens in lrn_layer and makes
// test_lrn_layer crash with segmentation fault.
// TODO: Use aligned Eigen::Map when the pointer to be wrapped is aligned.

// Though the default map option is unaligned, making it explicit is no harm.
//const int data_alignment = Eigen::Aligned; // how is data allocated ?
const int data_alignment = Eigen::Unaligned;
typedef Eigen::Map<const Eigen::VectorXf, data_alignment> const_map_vector_float_t;
typedef Eigen::Map<Eigen::VectorXf, data_alignment> map_vector_float_t;
typedef Eigen::Map<const Eigen::VectorXd, data_alignment> const_map_vector_double_t;
typedef Eigen::Map<Eigen::VectorXd, data_alignment> map_vector_double_t;

// The default in Eigen is column-major. This is also the case if one
// of the convenience typedefs (Matrix3f, ArrayXXd, etc.) is used.
// http://eigen.tuxfamily.org/dox-devel/group__TopicStorageOrders.html
typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatXf;
typedef Eigen::Map<MatXf, data_alignment> map_matrix_float_t;
typedef Eigen::Map<const MatXf, data_alignment> const_map_matrix_float_t;
typedef Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatXd;
typedef Eigen::Map<MatXd, data_alignment> map_matrix_double_t;
typedef Eigen::Map<const MatXd, data_alignment> const_map_matrix_double_t;

// From cblas.h
enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};

// Decaf gemm provides a simpler interface to the gemm functions, with the
// limitation that the data has to be contiguous in memory.
template <typename Dtype>
@@ -84,13 +112,19 @@ void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
template <typename Dtype>
void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);

template <typename Dtype>
Dtype caffe_nextafter(const Dtype b);

template <typename Dtype>
void caffe_vRngUniform(const int n, Dtype* r, const Dtype a, const Dtype b);

template <typename Dtype>
void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a,
const Dtype sigma);

template <typename Dtype>
void caffe_vRngBernoulli(const int n, Dtype* r, const double p);

template <typename Dtype>
void caffe_exp(const int n, const Dtype* a, Dtype* y);

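As a rough sketch of how the Map typedefs above translate a cblas call (illustrative, not code from the patch), an axpy-style update y = alpha * x + y can be written directly against the unaligned vector maps.

// Illustrative only: y = alpha * x + y through unaligned Eigen maps,
// playing the role of cblas_saxpy. Assumes x and y each hold n floats.
inline void caffe_axpy_sketch(const int n, const float alpha,
                              const float* x, float* y) {
  const_map_vector_float_t x_map(x, n);
  map_vector_float_t y_map(y, n);
  y_map += alpha * x_map;  // element-wise scaled accumulate, evaluated lazily by Eigen
}

The unaligned maps trade a little speed for safety: as the comment in the header notes, the pointers handed in (e.g. by lrn_layer) are not guaranteed to be aligned, so aligned maps would segfault.
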
1 change: 1 addition & 0 deletions include/caffe/vision_layers.hpp
@@ -294,6 +294,7 @@ class DataLayer : public Layer<Dtype> {
public:
explicit DataLayer(const LayerParameter& param)
: Layer<Dtype>(param) {}
virtual ~DataLayer();
virtual void SetUp(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top);

24 changes: 15 additions & 9 deletions src/caffe/common.cpp
@@ -21,7 +21,10 @@ long cluster_seedgen(void) {

Caffe::Caffe()
: mode_(Caffe::CPU), phase_(Caffe::TRAIN), cublas_handle_(NULL),
curand_generator_(NULL), vsl_stream_(NULL) {
curand_generator_(NULL),
//vsl_stream_(NULL)
random_generator_()
{
// Try to create a cublas handler, and report an error if failed (but we will
// keep the program running as one might just want to run CPU code).
if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
@@ -34,21 +37,22 @@ Caffe::Caffe()
!= CURAND_STATUS_SUCCESS) {
LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
}

// Try to create a vsl stream. This should almost always work, but we will
// check it anyway.
if (vslNewStream(&vsl_stream_, VSL_BRNG_MT19937, cluster_seedgen()) != VSL_STATUS_OK) {
LOG(ERROR) << "Cannot create vsl stream. VSL random number generator "
<< "won't be available.";
}
//if (vslNewStream(&vsl_stream_, VSL_BRNG_MT19937, cluster_seedgen()) != VSL_STATUS_OK) {
// LOG(ERROR) << "Cannot create vsl stream. VSL random number generator "
// << "won't be available.";
//}
}

Caffe::~Caffe() {
if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_));
if (curand_generator_) {
CURAND_CHECK(curandDestroyGenerator(curand_generator_));
}
if (vsl_stream_) VSL_CHECK(vslDeleteStream(&vsl_stream_));
};
//if (vsl_stream_) VSL_CHECK(vslDeleteStream(&vsl_stream_));
}

void Caffe::set_random_seed(const unsigned int seed) {
// Curand seed
@@ -64,8 +68,10 @@ void Caffe::set_random_seed(const unsigned int seed) {
LOG(ERROR) << "Curand not available. Skipping setting the curand seed.";
}
// VSL seed
VSL_CHECK(vslDeleteStream(&(Get().vsl_stream_)));
VSL_CHECK(vslNewStream(&(Get().vsl_stream_), VSL_BRNG_MT19937, seed));
//VSL_CHECK(vslDeleteStream(&(Get().vsl_stream_)));
//VSL_CHECK(vslNewStream(&(Get().vsl_stream_), VSL_BRNG_MT19937, seed));
Get().random_generator_ = random_generator_t(seed);

}

void Caffe::SetDevice(const int device_id) {
10 changes: 10 additions & 0 deletions src/caffe/layers/data_layer.cpp
@@ -17,8 +17,11 @@ namespace caffe {

template <typename Dtype>
void* DataLayerPrefetch(void* layer_pointer) {
CHECK(layer_pointer);
DataLayer<Dtype>* layer = reinterpret_cast<DataLayer<Dtype>*>(layer_pointer);
CHECK(layer);
Datum datum;
CHECK(layer->prefetch_data_);
Dtype* top_data = layer->prefetch_data_->mutable_cpu_data();
Dtype* top_label = layer->prefetch_label_->mutable_cpu_data();
const Dtype scale = layer->layer_param_.scale();
@@ -38,6 +41,8 @@ void* DataLayerPrefetch(void* layer_pointer) {
const Dtype* mean = layer->data_mean_.cpu_data();
for (int itemid = 0; itemid < batchsize; ++itemid) {
// get a blob
CHECK(layer->iter_);
CHECK(layer->iter_->Valid());
datum.ParseFromString(layer->iter_->value().ToString());
const string& data = datum.data();
if (cropsize) {
@@ -109,6 +114,11 @@ void* DataLayerPrefetch(void* layer_pointer) {
return (void*)NULL;
}

template <typename Dtype>
DataLayer<Dtype>::~DataLayer<Dtype>() {
// Finally, join the thread
CHECK(!pthread_join(thread_, NULL)) << "Pthread joining failed.";
}

template <typename Dtype>
void DataLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
7 changes: 5 additions & 2 deletions src/caffe/layers/dropout_layer.cu
@@ -4,6 +4,7 @@
#include <limits>

#include "caffe/common.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/layer.hpp"
#include "caffe/syncedmem.hpp"
#include "caffe/vision_layers.hpp"
@@ -34,8 +35,10 @@ void DropoutLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const int count = bottom[0]->count();
if (Caffe::phase() == Caffe::TRAIN) {
// Create random numbers
viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
count, mask, 1. - threshold_);
//viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
// count, mask, 1. - threshold_);
caffe_vRngBernoulli<int>(count, mask, 1. - threshold_);

for (int i = 0; i < count; ++i) {
top_data[i] = bottom_data[i] * mask[i] * scale_;
}
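For reference, a minimal sketch (not the patch's actual code) of what a caffe_vRngBernoulli built on the same Boost machinery could look like, producing the int mask the dropout layer consumes. Parameterizing the distribution over double rather than the integer Dtype matters here: the "fix bernoulli generator bug" commit above suggests an integer-typed distribution truncates p and yields an all-zero mask.

#include <boost/random/bernoulli_distribution.hpp>
#include <boost/random/variate_generator.hpp>

// Illustrative only: write n Bernoulli(p) samples (0 or 1) into r.
template <typename Dtype>
void bernoulli_fill_sketch(const int n, Dtype* r, const double p) {
  boost::bernoulli_distribution<double> dist(p);
  boost::variate_generator<Caffe::random_generator_t&,
      boost::bernoulli_distribution<double> > sampler(Caffe::vsl_stream(), dist);
  for (int i = 0; i < n; ++i) {
    r[i] = static_cast<Dtype>(sampler());
  }
}
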
2 changes: 2 additions & 0 deletions src/caffe/layers/flatten_layer.cpp
@@ -43,6 +43,7 @@ Dtype FlattenLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const Dtype* top_diff = top[0]->cpu_diff();
Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
caffe_copy(count_, top_diff, bottom_diff);
return Dtype(0);
}


@@ -52,6 +53,7 @@ Dtype FlattenLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const Dtype* top_diff = top[0]->gpu_diff();
Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff();
caffe_gpu_copy(count_, top_diff, bottom_diff);
return Dtype(0);
}

INSTANTIATE_CLASS(FlattenLayer);
2 changes: 1 addition & 1 deletion src/caffe/layers/inner_product_layer.cpp
@@ -1,7 +1,7 @@
// Copyright 2013 Yangqing Jia


#include <mkl.h>
//#include <mkl.h>
#include <cublas_v2.h>

#include <vector>
17 changes: 11 additions & 6 deletions src/caffe/test/test_common.cpp
@@ -6,7 +6,7 @@
#include "gtest/gtest.h"
#include "caffe/common.hpp"
#include "caffe/syncedmem.hpp"

#include "caffe/util/math_functions.hpp"
#include "caffe/test/test_caffe_main.hpp"

namespace caffe {
@@ -20,7 +20,8 @@ TEST_F(CommonTest, TestCublasHandler) {
}

TEST_F(CommonTest, TestVslStream) {
EXPECT_TRUE(Caffe::vsl_stream());
//EXPECT_TRUE(Caffe::vsl_stream());
EXPECT_TRUE(true);
}

TEST_F(CommonTest, TestBrewMode) {
@@ -39,11 +40,15 @@ TEST_F(CommonTest, TestRandSeedCPU) {
SyncedMemory data_a(10 * sizeof(int));
SyncedMemory data_b(10 * sizeof(int));
Caffe::set_random_seed(1701);
viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
10, (int*)data_a.mutable_cpu_data(), 0.5);
//viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
// 10, (int*)data_a.mutable_cpu_data(), 0.5);
caffe_vRngBernoulli(10, (int*)data_a.mutable_cpu_data(), 0.5);

Caffe::set_random_seed(1701);
viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
10, (int*)data_b.mutable_cpu_data(), 0.5);
//viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
// 10, (int*)data_b.mutable_cpu_data(), 0.5);
caffe_vRngBernoulli(10, (int*)data_b.mutable_cpu_data(), 0.5);

for (int i = 0; i < 10; ++i) {
EXPECT_EQ(((const int*)(data_a.cpu_data()))[i],
((const int*)(data_b.cpu_data()))[i]);
4 changes: 2 additions & 2 deletions src/caffe/test/test_data_layer.cpp
@@ -81,8 +81,8 @@ TYPED_TEST(DataLayerTest, TestRead) {
EXPECT_EQ(this->blob_top_label_->channels(), 1);
EXPECT_EQ(this->blob_top_label_->height(), 1);
EXPECT_EQ(this->blob_top_label_->width(), 1);
// Go throught the data twice
for (int iter = 0; iter < 2; ++iter) {
// Go through the data 100 times
for (int iter = 0; iter < 100; ++iter) {
layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_);
for (int i = 0; i < 5; ++i) {
EXPECT_EQ(i, this->blob_top_label_->cpu_data()[i]);
3 changes: 3 additions & 0 deletions src/caffe/test/test_flatten_layer.cpp
@@ -22,6 +22,7 @@ class FlattenLayerTest : public ::testing::Test {
FlattenLayerTest()
: blob_bottom_(new Blob<Dtype>(2, 3, 6, 5)),
blob_top_(new Blob<Dtype>()) {
Caffe::set_random_seed(1701);
// fill the values
FillerParameter filler_param;
GaussianFiller<Dtype> filler(filler_param);
@@ -72,6 +73,8 @@ TYPED_TEST(FlattenLayerTest, TestGPU) {
for (int c = 0; c < 3 * 6 * 5; ++c) {
EXPECT_EQ(this->blob_top_->data_at(0, c, 0, 0),
this->blob_bottom_->data_at(0, c / (6 * 5), (c / 5) % 6, c % 5));
EXPECT_EQ(this->blob_top_->data_at(1, c, 0, 0),
this->blob_bottom_->data_at(1, c / (6 * 5), (c / 5) % 6, c % 5));
}
}
