From 953cd393f05ef2c470f04bbc804bafb7af5624c7 Mon Sep 17 00:00:00 2001
From: guolinke <guolin.ke@outlook.com>
Date: Fri, 28 Feb 2020 17:42:09 +0800
Subject: [PATCH 1/7] don't cache `num_threads`, since it can be changed externally

---
 include/LightGBM/dataset.h                       |  5 +----
 include/LightGBM/utils/array_args.h              |  7 +------
 include/LightGBM/utils/common.h                  | 12 ++----------
 include/LightGBM/utils/openmp_wrapper.h          |  9 +++++++++
 include/LightGBM/utils/threading.h               | 10 ++--------
 src/application/predictor.hpp                    |  6 +-----
 src/c_api.cpp                                    |  7 +------
 src/io/dataset.cpp                               | 15 +++------------
 src/io/multi_val_sparse_bin.hpp                  |  5 +----
 src/io/sparse_bin.hpp                            |  7 +------
 src/metric/map_metric.hpp                        | 13 +++----------
 src/metric/rank_metric.hpp                       | 13 +++----------
 src/treelearner/data_parallel_tree_learner.cpp   |  5 +++--
 src/treelearner/serial_tree_learner.cpp          | 10 +++-------
 src/treelearner/serial_tree_learner.h            |  1 -
 src/treelearner/voting_parallel_tree_learner.cpp |  5 +++--
 16 files changed, 37 insertions(+), 93 deletions(-)
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index 0ba7dd71764b..8846d7ade3af 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -292,10 +292,7 @@ struct TrainingTempState {
       return;
     }
     multi_val_bin.reset(bin);
-    int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-    { num_threads = omp_get_num_threads(); }
+    int num_threads = omp_get_num_threads();
     num_bin_aligned =
         (bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
     size_t new_size = static_cast<size_t>(num_bin_aligned) * 2 * num_threads;
diff --git a/include/LightGBM/utils/array_args.h b/include/LightGBM/utils/array_args.h
index 49beb077823f..2935f1619397 100644
--- a/include/LightGBM/utils/array_args.h
+++ b/include/LightGBM/utils/array_args.h
@@ -21,12 +21,7 @@ template<typename VAL_T>
 class ArrayArgs {
  public:
   inline static size_t ArgMaxMT(const std::vector<VAL_T>& array) {
-    int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-    {
-      num_threads = omp_get_num_threads();
-    }
+    int num_threads = omp_get_num_threads();
     std::vector<size_t> arg_maxs(num_threads, 0);
     int n_blocks = Threading::For<size_t>(
         0, array.size(), 1024,
diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h
index 1e447e973c2e..8886fe2ec226 100644
--- a/include/LightGBM/utils/common.h
+++ b/include/LightGBM/utils/common.h
@@ -727,12 +727,7 @@ template<typename _RanIt, typename _Pr, typename _VTRanIt> inline
 static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) {
   size_t len = _Last - _First;
   const size_t kMinInnerLen = 1024;
-  int num_threads = 1;
-  #pragma omp parallel
-  #pragma omp master
-  {
-    num_threads = omp_get_num_threads();
-  }
+  int num_threads = omp_num_threads();
   if (len <= kMinInnerLen || num_threads <= 1) {
     std::sort(_First, _Last, _Pred);
     return;
@@ -1032,10 +1027,7 @@ class Timer {
  public:
   Timer() {
 #ifdef TIMETAG
-    int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-    { num_threads = omp_get_num_threads(); }
+    int num_threads = omp_num_threads();
     start_time_.resize(num_threads);
     stats_.resize(num_threads);
 #endif  // TIMETAG
diff --git a/include/LightGBM/utils/openmp_wrapper.h b/include/LightGBM/utils/openmp_wrapper.h
index 20c90e063291..4bb93463d74b 100644
--- a/include/LightGBM/utils/openmp_wrapper.h
+++ b/include/LightGBM/utils/openmp_wrapper.h
@@ -15,6 +15,14 @@
 #include <stdexcept>
 #include <vector>
 
+inline int omp_num_threads() {
+  int ret = 1;
+#pragma omp parallel
+#pragma omp master
+  { ret = omp_get_num_threads(); }
+  return ret;
+}
+
 class ThreadExceptionHelper {
  public:
   ThreadExceptionHelper() {
@@ -70,6 +78,7 @@ class ThreadExceptionHelper {
   inline void omp_set_num_threads(int) {}
   inline int omp_get_num_threads() {return 1;}
   inline int omp_get_thread_num() {return 0;}
+  inline int omp_num_threads() { return 1; }
 #ifdef __cplusplus
 };  // extern "C"
 #endif
diff --git a/include/LightGBM/utils/threading.h b/include/LightGBM/utils/threading.h
index c4cb57a49d73..4a0acedc0cb0 100644
--- a/include/LightGBM/utils/threading.h
+++ b/include/LightGBM/utils/threading.h
@@ -21,10 +21,7 @@ class Threading {
   template <typename INDEX_T>
   static inline void BlockInfo(INDEX_T cnt, INDEX_T min_cnt_per_block,
                                int* out_nblock, INDEX_T* block_size) {
-    int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-    { num_threads = omp_get_num_threads(); }
+    int num_threads = omp_num_threads();
     BlockInfo<INDEX_T>(num_threads, cnt, min_cnt_per_block, out_nblock,
                        block_size);
   }
@@ -84,10 +81,7 @@ class ParallelPartitionRunner {
  public:
   ParallelPartitionRunner(INDEX_T num_data, INDEX_T min_block_size)
       : min_block_size_(min_block_size) {
-    num_threads_ = 1;
-#pragma omp parallel
-#pragma omp master
-    { num_threads_ = omp_get_num_threads(); }
+    num_threads_ = omp_get_num_threads();
     left_.resize(num_data);
     if (TWO_BUFFER) {
       right_.resize(num_data);
diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp
index 6580186cb396..65c2fc88df7a 100644
--- a/src/application/predictor.hpp
+++ b/src/application/predictor.hpp
@@ -56,16 +56,13 @@ class Predictor {
       }
     }
 
-#pragma omp parallel
-#pragma omp master
-    { num_threads_ = omp_get_num_threads(); }
     boosting->InitPredict(num_iteration, predict_contrib);
     boosting_ = boosting;
     num_pred_one_row_ = boosting_->NumPredictOneRow(
         num_iteration, predict_leaf_index, predict_contrib);
     num_feature_ = boosting_->MaxFeatureIdx() + 1;
     predict_buf_.resize(
-        num_threads_,
+        omp_get_num_threads(),
         std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>(
             num_feature_, 0.0f));
     const int kFeatureThreshold = 100000;
@@ -281,7 +278,6 @@ class Predictor {
   PredictionEarlyStopInstance early_stop_;
   int num_feature_;
   int num_pred_one_row_;
-  int num_threads_;
   std::vector<std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>> predict_buf_;
 };
 
diff --git a/src/c_api.cpp b/src/c_api.cpp
index 60c32ce30968..763996d43c5b 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -1529,12 +1529,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle,
   if (config.num_threads > 0) {
     omp_set_num_threads(config.num_threads);
   }
-  int num_threads = 1;
-  #pragma omp parallel
-  #pragma omp master
-  {
-    num_threads = omp_get_num_threads();
-  }
+  int num_threads = omp_get_num_threads();
   int ncol = static_cast<int>(ncol_ptr - 1);
   std::vector<std::vector<CSC_RowIterator>> iterators(num_threads, std::vector<CSC_RowIterator>());
   for (int i = 0; i < num_threads; ++i) {
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 888bb22be963..ab4e53b8acf7 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -506,10 +506,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
   }
   const auto& offsets = feature_groups_[multi_group_id]->bin_offsets_;
   const int num_feature = feature_groups_[multi_group_id]->num_feature_;
-  int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-  { num_threads = omp_get_num_threads(); }
+  int num_threads = omp_get_num_threads();
 
   std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
   std::vector<uint32_t> most_freq_bins;
@@ -539,10 +536,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
 MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
   Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures",
                                  global_timer);
-  int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-  { num_threads = omp_get_num_threads(); }
+  int num_threads = omp_get_num_threads();
   double sum_dense_ratio = 0;
 
   std::unique_ptr<MultiValBin> ret;
@@ -1185,10 +1179,7 @@ void Dataset::ConstructHistogramsMultiVal(
   if (multi_val_bin == nullptr) {
     return;
   }
-  int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-  { num_threads = omp_get_num_threads(); }
+  int num_threads = omp_get_num_threads();
 
   global_timer.Start("Dataset::sparse_bin_histogram");
   const int num_bin = multi_val_bin->num_bin();
diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp
index 7af83c92cbbd..47259ac9d3d5 100644
--- a/src/io/multi_val_sparse_bin.hpp
+++ b/src/io/multi_val_sparse_bin.hpp
@@ -27,10 +27,7 @@ class MultiValSparseBin : public MultiValBin {
     INDEX_T estimate_num_data =
         static_cast<INDEX_T>(estimate_element_per_row_ * 1.1) *
         static_cast<INDEX_T>(num_data_);
-    int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-    { num_threads = omp_get_num_threads(); }
+    int num_threads = omp_get_num_threads();
     if (num_threads > 1) {
       t_data_.resize(num_threads - 1);
       for (size_t i = 0; i < t_data_.size(); ++i) {
diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp
index 591add46f0ed..476e2cacd62a 100644
--- a/src/io/sparse_bin.hpp
+++ b/src/io/sparse_bin.hpp
@@ -73,12 +73,7 @@ class SparseBin: public Bin {
 
   explicit SparseBin(data_size_t num_data)
     : num_data_(num_data) {
-    int num_threads = 1;
-    #pragma omp parallel
-    #pragma omp master
-    {
-      num_threads = omp_get_num_threads();
-    }
+    int num_threads = omp_get_num_threads();
     push_buffers_.resize(num_threads);
   }
 
diff --git a/src/metric/map_metric.hpp b/src/metric/map_metric.hpp
index f6f79e1a4678..7d9c27fe1f36 100644
--- a/src/metric/map_metric.hpp
+++ b/src/metric/map_metric.hpp
@@ -23,12 +23,6 @@ class MapMetric:public Metric {
     // get eval position
     eval_at_ = config.eval_at;
     DCGCalculator::DefaultEvalAt(&eval_at_);
-    // get number of threads
-    #pragma omp parallel
-    #pragma omp master
-    {
-      num_threads_ = omp_get_num_threads();
-    }
   }
 
   ~MapMetric() {
@@ -110,8 +104,9 @@ class MapMetric:public Metric {
   }
   std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
     // some buffers for multi-threading sum up
+    int num_thread = omp_get_num_threads();
     std::vector<std::vector<double>> result_buffer_;
-    for (int i = 0; i < num_threads_; ++i) {
+    for (int i = 0; i < num_thread; ++i) {
       result_buffer_.emplace_back(eval_at_.size(), 0.0f);
     }
     std::vector<double> tmp_map(eval_at_.size(), 0.0f);
@@ -139,7 +134,7 @@ class MapMetric:public Metric {
     // Get final average MAP
     std::vector<double> result(eval_at_.size(), 0.0f);
     for (size_t j = 0; j < result.size(); ++j) {
-      for (int i = 0; i < num_threads_; ++i) {
+      for (int i = 0; i < num_thread; ++i) {
         result[j] += result_buffer_[i][j];
       }
       result[j] /= sum_query_weights_;
@@ -162,8 +157,6 @@ class MapMetric:public Metric {
   double sum_query_weights_;
   /*! \brief Evaluate position of Nmap */
   std::vector<data_size_t> eval_at_;
-  /*! \brief Number of threads */
-  int num_threads_;
   std::vector<std::string> name_;
   std::vector<data_size_t> npos_per_query_;
 };
diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp
index 9c258dde6201..2c08256a2bb5 100644
--- a/src/metric/rank_metric.hpp
+++ b/src/metric/rank_metric.hpp
@@ -26,12 +26,6 @@ class NDCGMetric:public Metric {
     DCGCalculator::DefaultLabelGain(&label_gain);
     // initialize DCG calculator
     DCGCalculator::Init(label_gain);
-    // get number of threads
-    #pragma omp parallel
-    #pragma omp master
-    {
-      num_threads_ = omp_get_num_threads();
-    }
   }
 
   ~NDCGMetric() {
@@ -89,9 +83,10 @@ class NDCGMetric:public Metric {
   }
 
   std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
+    int num_threads = omp_num_threads();
     // some buffers for multi-threading sum up
     std::vector<std::vector<double>> result_buffer_;
-    for (int i = 0; i < num_threads_; ++i) {
+    for (int i = 0; i < num_threads; ++i) {
       result_buffer_.emplace_back(eval_at_.size(), 0.0f);
     }
     std::vector<double> tmp_dcg(eval_at_.size(), 0.0f);
@@ -139,7 +134,7 @@ class NDCGMetric:public Metric {
     // Get final average NDCG
     std::vector<double> result(eval_at_.size(), 0.0f);
     for (size_t j = 0; j < result.size(); ++j) {
-      for (int i = 0; i < num_threads_; ++i) {
+      for (int i = 0; i < num_threads; ++i) {
         result[j] += result_buffer_[i][j];
       }
       result[j] /= sum_query_weights_;
@@ -166,8 +161,6 @@ class NDCGMetric:public Metric {
   std::vector<data_size_t> eval_at_;
   /*! \brief Cache the inverse max dcg for all queries */
   std::vector<std::vector<double>> inverse_max_dcgs_;
-  /*! \brief Number of threads */
-  int num_threads_;
 };
 
 }  // namespace LightGBM
diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp
index 7c8c0d4c5cda..6e063fa7b579 100644
--- a/src/treelearner/data_parallel_tree_learner.cpp
+++ b/src/treelearner/data_parallel_tree_learner.cpp
@@ -165,8 +165,9 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
 
 template <typename TREELEARNER_T>
 void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
-  std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_, SplitInfo());
-  std::vector<SplitInfo> larger_bests_per_thread(this->num_threads_, SplitInfo());
+  int num_threads = omp_num_threads();
+  std::vector<SplitInfo> smaller_bests_per_thread(num_threads, SplitInfo());
+  std::vector<SplitInfo> larger_bests_per_thread(num_threads, SplitInfo());
   std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
   std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
   if (this->config_->feature_fraction_bynode < 1.0f) {
diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp
index 6f9e65397853..b275fa6345b1 100644
--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -21,11 +21,6 @@ namespace LightGBM {
 SerialTreeLearner::SerialTreeLearner(const Config* config)
   :config_(config) {
   random_ = Random(config_->feature_fraction_seed);
-  #pragma omp parallel
-  #pragma omp master
-  {
-    num_threads_ = omp_get_num_threads();
-  }
 }
 
 SerialTreeLearner::~SerialTreeLearner() {
@@ -400,8 +395,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
     const std::vector<int8_t>& is_feature_used, bool use_subtract) {
   Common::FunctionTimer fun_timer(
       "SerialTreeLearner::FindBestSplitsFromHistograms", global_timer);
-  std::vector<SplitInfo> smaller_best(num_threads_);
-  std::vector<SplitInfo> larger_best(num_threads_);
+  int num_threads = omp_num_threads();
+  std::vector<SplitInfo> smaller_best(num_threads);
+  std::vector<SplitInfo> larger_best(num_threads);
   std::vector<int8_t> smaller_node_used_features(num_features_, 1);
   std::vector<int8_t> larger_node_used_features(num_features_, 1);
   if (config_->feature_fraction_bynode < 1.0f) {
diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h
index dd267208eadd..197ed032c484 100644
--- a/src/treelearner/serial_tree_learner.h
+++ b/src/treelearner/serial_tree_learner.h
@@ -188,7 +188,6 @@ class SerialTreeLearner: public TreeLearner {
   HistogramPool histogram_pool_;
   /*! \brief config of tree learner*/
   const Config* config_;
-  int num_threads_;
   std::vector<int> ordered_bin_indices_;
   bool is_constant_hessian_;
   std::unique_ptr<TrainingTempState> temp_state_;
diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp
index 71ebbf9a97e3..412992a389e1 100644
--- a/src/treelearner/voting_parallel_tree_learner.cpp
+++ b/src/treelearner/voting_parallel_tree_learner.cpp
@@ -349,8 +349,9 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
 
 template <typename TREELEARNER_T>
 void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
-  std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_);
-  std::vector<SplitInfo> larger_best_per_thread(this->num_threads_);
+  int num_threads = omp_num_threads();
+  std::vector<SplitInfo> smaller_bests_per_thread(num_threads);
+  std::vector<SplitInfo> larger_best_per_thread(num_threads);
   std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
   std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
   if (this->config_->feature_fraction_bynode < 1.0f) {

From 8bc8510dab162700ce98976ea352469f5ca3eadc Mon Sep 17 00:00:00 2001
From: guolinke <guolin.ke@outlook.com>
Date: Fri, 28 Feb 2020 17:47:15 +0800
Subject: [PATCH 2/7] rename omp_num_threads() to OMP_NUM_THREADS() and use it at all call sites

---
 include/LightGBM/dataset.h                       | 2 +-
 include/LightGBM/utils/array_args.h              | 2 +-
 include/LightGBM/utils/common.h                  | 4 ++--
 include/LightGBM/utils/openmp_wrapper.h          | 4 ++--
 include/LightGBM/utils/threading.h               | 4 ++--
 src/application/predictor.hpp                    | 2 +-
 src/c_api.cpp                                    | 2 +-
 src/io/dataset.cpp                               | 6 +++---
 src/io/multi_val_sparse_bin.hpp                  | 2 +-
 src/io/sparse_bin.hpp                            | 2 +-
 src/metric/map_metric.hpp                        | 2 +-
 src/metric/rank_metric.hpp                       | 2 +-
 src/treelearner/data_parallel_tree_learner.cpp   | 2 +-
 src/treelearner/serial_tree_learner.cpp          | 2 +-
 src/treelearner/voting_parallel_tree_learner.cpp | 2 +-
 15 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index 8846d7ade3af..d63f1c966960 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -292,7 +292,7 @@ struct TrainingTempState {
       return;
     }
     multi_val_bin.reset(bin);
-    int num_threads = omp_get_num_threads();
+    int num_threads = OMP_NUM_THREADS();
     num_bin_aligned =
         (bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
     size_t new_size = static_cast<size_t>(num_bin_aligned) * 2 * num_threads;
diff --git a/include/LightGBM/utils/array_args.h b/include/LightGBM/utils/array_args.h
index 2935f1619397..0183ecc22ddb 100644
--- a/include/LightGBM/utils/array_args.h
+++ b/include/LightGBM/utils/array_args.h
@@ -21,7 +21,7 @@ template<typename VAL_T>
 class ArrayArgs {
  public:
   inline static size_t ArgMaxMT(const std::vector<VAL_T>& array) {
-    int num_threads = omp_get_num_threads();
+    int num_threads = OMP_NUM_THREADS();
     std::vector<size_t> arg_maxs(num_threads, 0);
     int n_blocks = Threading::For<size_t>(
         0, array.size(), 1024,
diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h
index 8886fe2ec226..fffed09fc288 100644
--- a/include/LightGBM/utils/common.h
+++ b/include/LightGBM/utils/common.h
@@ -727,7 +727,7 @@ template<typename _RanIt, typename _Pr, typename _VTRanIt> inline
 static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) {
   size_t len = _Last - _First;
   const size_t kMinInnerLen = 1024;
-  int num_threads = omp_num_threads();
+  int num_threads = OMP_NUM_THREADS();
   if (len <= kMinInnerLen || num_threads <= 1) {
     std::sort(_First, _Last, _Pred);
     return;
@@ -1027,7 +1027,7 @@ class Timer {
  public:
   Timer() {
 #ifdef TIMETAG
-    int num_threads = omp_num_threads();
+    int num_threads = OMP_NUM_THREADS();
     start_time_.resize(num_threads);
     stats_.resize(num_threads);
 #endif  // TIMETAG
diff --git a/include/LightGBM/utils/openmp_wrapper.h b/include/LightGBM/utils/openmp_wrapper.h
index 4bb93463d74b..123ca37c1a05 100644
--- a/include/LightGBM/utils/openmp_wrapper.h
+++ b/include/LightGBM/utils/openmp_wrapper.h
@@ -15,7 +15,7 @@
 #include <stdexcept>
 #include <vector>
 
-inline int omp_num_threads() {
+inline int OMP_NUM_THREADS() {
   int ret = 1;
 #pragma omp parallel
 #pragma omp master
@@ -78,7 +78,7 @@ class ThreadExceptionHelper {
   inline void omp_set_num_threads(int) {}
   inline int omp_get_num_threads() {return 1;}
   inline int omp_get_thread_num() {return 0;}
-  inline int omp_num_threads() { return 1; }
+  inline int OMP_NUM_THREADS() { return 1; }
 #ifdef __cplusplus
 };  // extern "C"
 #endif
diff --git a/include/LightGBM/utils/threading.h b/include/LightGBM/utils/threading.h
index 4a0acedc0cb0..7202b47b85ad 100644
--- a/include/LightGBM/utils/threading.h
+++ b/include/LightGBM/utils/threading.h
@@ -21,7 +21,7 @@ class Threading {
   template <typename INDEX_T>
   static inline void BlockInfo(INDEX_T cnt, INDEX_T min_cnt_per_block,
                                int* out_nblock, INDEX_T* block_size) {
-    int num_threads = omp_num_threads();
+    int num_threads = OMP_NUM_THREADS();
     BlockInfo<INDEX_T>(num_threads, cnt, min_cnt_per_block, out_nblock,
                        block_size);
   }
@@ -81,7 +81,7 @@ class ParallelPartitionRunner {
  public:
   ParallelPartitionRunner(INDEX_T num_data, INDEX_T min_block_size)
       : min_block_size_(min_block_size) {
-    num_threads_ = omp_get_num_threads();
+    num_threads_ = OMP_NUM_THREADS();
     left_.resize(num_data);
     if (TWO_BUFFER) {
       right_.resize(num_data);
diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp
index 65c2fc88df7a..a38e872de5b3 100644
--- a/src/application/predictor.hpp
+++ b/src/application/predictor.hpp
@@ -62,7 +62,7 @@ class Predictor {
         num_iteration, predict_leaf_index, predict_contrib);
     num_feature_ = boosting_->MaxFeatureIdx() + 1;
     predict_buf_.resize(
-        omp_get_num_threads(),
+        OMP_NUM_THREADS(),
         std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>(
             num_feature_, 0.0f));
     const int kFeatureThreshold = 100000;
diff --git a/src/c_api.cpp b/src/c_api.cpp
index 763996d43c5b..bb8585502073 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -1529,7 +1529,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle,
   if (config.num_threads > 0) {
     omp_set_num_threads(config.num_threads);
   }
-  int num_threads = omp_get_num_threads();
+  int num_threads = OMP_NUM_THREADS();
   int ncol = static_cast<int>(ncol_ptr - 1);
   std::vector<std::vector<CSC_RowIterator>> iterators(num_threads, std::vector<CSC_RowIterator>());
   for (int i = 0; i < num_threads; ++i) {
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index ab4e53b8acf7..7cf179038948 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -506,7 +506,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
   }
   const auto& offsets = feature_groups_[multi_group_id]->bin_offsets_;
   const int num_feature = feature_groups_[multi_group_id]->num_feature_;
-  int num_threads = omp_get_num_threads();
+  int num_threads = OMP_NUM_THREADS();
 
   std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
   std::vector<uint32_t> most_freq_bins;
@@ -536,7 +536,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
 MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
   Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures",
                                  global_timer);
-  int num_threads = omp_get_num_threads();
+  int num_threads = OMP_NUM_THREADS();
   double sum_dense_ratio = 0;
 
   std::unique_ptr<MultiValBin> ret;
@@ -1179,7 +1179,7 @@ void Dataset::ConstructHistogramsMultiVal(
   if (multi_val_bin == nullptr) {
     return;
   }
-  int num_threads = omp_get_num_threads();
+  int num_threads = OMP_NUM_THREADS();
 
   global_timer.Start("Dataset::sparse_bin_histogram");
   const int num_bin = multi_val_bin->num_bin();
diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp
index 47259ac9d3d5..b2d06175bfb2 100644
--- a/src/io/multi_val_sparse_bin.hpp
+++ b/src/io/multi_val_sparse_bin.hpp
@@ -27,7 +27,7 @@ class MultiValSparseBin : public MultiValBin {
     INDEX_T estimate_num_data =
         static_cast<INDEX_T>(estimate_element_per_row_ * 1.1) *
         static_cast<INDEX_T>(num_data_);
-    int num_threads = omp_get_num_threads();
+    int num_threads = OMP_NUM_THREADS();
     if (num_threads > 1) {
       t_data_.resize(num_threads - 1);
       for (size_t i = 0; i < t_data_.size(); ++i) {
diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp
index 476e2cacd62a..57e507b6bf76 100644
--- a/src/io/sparse_bin.hpp
+++ b/src/io/sparse_bin.hpp
@@ -73,7 +73,7 @@ class SparseBin: public Bin {
 
   explicit SparseBin(data_size_t num_data)
     : num_data_(num_data) {
-    int num_threads = omp_get_num_threads();
+    int num_threads = OMP_NUM_THREADS();
     push_buffers_.resize(num_threads);
   }
 
diff --git a/src/metric/map_metric.hpp b/src/metric/map_metric.hpp
index 7d9c27fe1f36..ec0f5f2d8179 100644
--- a/src/metric/map_metric.hpp
+++ b/src/metric/map_metric.hpp
@@ -104,7 +104,7 @@ class MapMetric:public Metric {
   }
   std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
     // some buffers for multi-threading sum up
-    int num_thread = omp_get_num_threads();
+    int num_thread = OMP_NUM_THREADS();
     std::vector<std::vector<double>> result_buffer_;
     for (int i = 0; i < num_thread; ++i) {
       result_buffer_.emplace_back(eval_at_.size(), 0.0f);
diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp
index 2c08256a2bb5..3b3afb547eb9 100644
--- a/src/metric/rank_metric.hpp
+++ b/src/metric/rank_metric.hpp
@@ -83,7 +83,7 @@ class NDCGMetric:public Metric {
   }
 
   std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
-    int num_threads = omp_num_threads();
+    int num_threads = OMP_NUM_THREADS();
     // some buffers for multi-threading sum up
     std::vector<std::vector<double>> result_buffer_;
     for (int i = 0; i < num_threads; ++i) {
diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp
index 6e063fa7b579..d0b732380763 100644
--- a/src/treelearner/data_parallel_tree_learner.cpp
+++ b/src/treelearner/data_parallel_tree_learner.cpp
@@ -165,7 +165,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
 
 template <typename TREELEARNER_T>
 void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
-  int num_threads = omp_num_threads();
+  int num_threads = OMP_NUM_THREADS();
   std::vector<SplitInfo> smaller_bests_per_thread(num_threads, SplitInfo());
   std::vector<SplitInfo> larger_bests_per_thread(num_threads, SplitInfo());
   std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp
index b275fa6345b1..e2ae3038fe21 100644
--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -395,7 +395,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
     const std::vector<int8_t>& is_feature_used, bool use_subtract) {
   Common::FunctionTimer fun_timer(
       "SerialTreeLearner::FindBestSplitsFromHistograms", global_timer);
-  int num_threads = omp_num_threads();
+  int num_threads = OMP_NUM_THREADS();
   std::vector<SplitInfo> smaller_best(num_threads);
   std::vector<SplitInfo> larger_best(num_threads);
   std::vector<int8_t> smaller_node_used_features(num_features_, 1);
diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp
index 412992a389e1..d34c31ff5d4f 100644
--- a/src/treelearner/voting_parallel_tree_learner.cpp
+++ b/src/treelearner/voting_parallel_tree_learner.cpp
@@ -349,7 +349,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
 
 template <typename TREELEARNER_T>
 void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
-  int num_threads = omp_num_threads();
+  int num_threads = OMP_NUM_THREADS();
   std::vector<SplitInfo> smaller_bests_per_thread(num_threads);
   std::vector<SplitInfo> larger_best_per_thread(num_threads);
   std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);

From ae3f216f028e5bc6c8a9e7e8e25b399f74ec9d3e Mon Sep 17 00:00:00 2001
From: guolinke <guolin.ke@outlook.com>
Date: Fri, 28 Feb 2020 18:29:44 +0800
Subject: [PATCH 3/7] update document

---
 docs/Parameters.rst       | 2 ++
 include/LightGBM/config.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 62f7f184609e..c811af0bd47b 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -171,6 +171,8 @@ Core Parameters
 
    -  for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
 
+   -  **Note**: Please **don't** change this during training, especifically when running multiple jobs simultaneously by external packages, otherwise may cause undesible errors.
+
 -  ``device_type`` :raw-html:`<a id="device_type" title="Permalink to this parameter" href="#device_type">&#x1F517;&#xFE0E;</a>`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, aliases: ``device``
 
    -  device for the tree learning, you can use GPU to achieve the faster learning
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 6e5dd1b3ea51..243b11880f0f 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -192,6 +192,7 @@ struct Config {
   // desc = do not set it too large if your dataset is small (for instance, do not use 64 threads for a dataset with 10,000 rows)
   // desc = be aware a task manager or any similar CPU monitoring tool might report that cores not being fully utilized. **This is normal**
   // desc = for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
+  // desc = **Note**: Please **don't** change this during training, especifically when running multiple jobs simultaneously by external packages, otherwise may cause undesible errors.
   int num_threads = 0;
 
   // [doc-only]

From bc4a26bc47e1e24e71a57bf1a6bb2962aae0b543 Mon Sep 17 00:00:00 2001
From: Guolin Ke <guolin.ke@outlook.com>
Date: Fri, 28 Feb 2020 18:39:05 +0800
Subject: [PATCH 4/7] Update docs/Parameters.rst

---
 docs/Parameters.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index c811af0bd47b..4c2c852e7c68 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -171,7 +171,7 @@ Core Parameters
 
    -  for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
 
-   -  **Note**: Please **don't** change this during training, especifically when running multiple jobs simultaneously by external packages, otherwise may cause undesible errors.
+   -  **Note**: Please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors.
 
 -  ``device_type`` :raw-html:`<a id="device_type" title="Permalink to this parameter" href="#device_type">&#x1F517;&#xFE0E;</a>`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, aliases: ``device``
 

From 77854ac399bdce5a4293200c784b0eeb98780344 Mon Sep 17 00:00:00 2001
From: Guolin Ke <guolin.ke@outlook.com>
Date: Fri, 28 Feb 2020 18:39:41 +0800
Subject: [PATCH 5/7] Update include/LightGBM/config.h

---
 include/LightGBM/config.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 243b11880f0f..beab1bd45afb 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -192,7 +192,7 @@ struct Config {
   // desc = do not set it too large if your dataset is small (for instance, do not use 64 threads for a dataset with 10,000 rows)
   // desc = be aware a task manager or any similar CPU monitoring tool might report that cores not being fully utilized. **This is normal**
   // desc = for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
-  // desc = **Note**: Please **don't** change this during training, especifically when running multiple jobs simultaneously by external packages, otherwise may cause undesible errors.
+  // desc = **Note**: Please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors.
   int num_threads = 0;
 
   // [doc-only]

From 76cec0e670cf954da6682c5ba05fe79a70f92313 Mon Sep 17 00:00:00 2001
From: Guolin Ke <guolin.ke@outlook.com>
Date: Sat, 29 Feb 2020 10:32:44 +0800
Subject: [PATCH 6/7] Apply suggestions from code review

Co-Authored-By: Nikita Titov <nekit94-08@mail.ru>
---
 docs/Parameters.rst       | 2 +-
 include/LightGBM/config.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 4c2c852e7c68..573602c47f61 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -171,7 +171,7 @@ Core Parameters
 
    -  for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
 
-   -  **Note**: Please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors.
+   -  **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors
 
 -  ``device_type`` :raw-html:`<a id="device_type" title="Permalink to this parameter" href="#device_type">&#x1F517;&#xFE0E;</a>`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, aliases: ``device``
 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index beab1bd45afb..8ca2066e141b 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -192,7 +192,7 @@ struct Config {
   // desc = do not set it too large if your dataset is small (for instance, do not use 64 threads for a dataset with 10,000 rows)
   // desc = be aware a task manager or any similar CPU monitoring tool might report that cores not being fully utilized. **This is normal**
   // desc = for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
-  // desc = **Note**: Please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors.
+  // desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors
   int num_threads = 0;
 
   // [doc-only]

From 560f652c6b8e70ac5d6c7b06ab8cf22081449e84 Mon Sep 17 00:00:00 2001
From: Guolin Ke <guolin.ke@outlook.com>
Date: Sat, 29 Feb 2020 23:16:47 +0800
Subject: [PATCH 7/7] Apply suggestions from code review

Co-Authored-By: Nikita Titov <nekit94-08@mail.ru>
---
 src/metric/map_metric.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/metric/map_metric.hpp b/src/metric/map_metric.hpp
index ec0f5f2d8179..18539ee44ee0 100644
--- a/src/metric/map_metric.hpp
+++ b/src/metric/map_metric.hpp
@@ -104,9 +104,9 @@ class MapMetric:public Metric {
   }
   std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
     // some buffers for multi-threading sum up
-    int num_thread = OMP_NUM_THREADS();
+    int num_threads = OMP_NUM_THREADS();
     std::vector<std::vector<double>> result_buffer_;
-    for (int i = 0; i < num_thread; ++i) {
+    for (int i = 0; i < num_threads; ++i) {
       result_buffer_.emplace_back(eval_at_.size(), 0.0f);
     }
     std::vector<double> tmp_map(eval_at_.size(), 0.0f);
@@ -134,7 +134,7 @@ class MapMetric:public Metric {
     // Get final average MAP
     std::vector<double> result(eval_at_.size(), 0.0f);
     for (size_t j = 0; j < result.size(); ++j) {
-      for (int i = 0; i < num_thread; ++i) {
+      for (int i = 0; i < num_threads; ++i) {
         result[j] += result_buffer_[i][j];
       }
       result[j] /= sum_query_weights_;