don't save num_thread as possible #2839

Merged Mar 2, 2020 · 8 commits (diff shown from 7 commits)

2 changes: 2 additions & 0 deletions docs/Parameters.rst
@@ -171,6 +171,8 @@ Core Parameters
 
    - for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
 
+   - **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors
+
 - ``device_type`` :raw-html:`<a id="device_type" title="Permalink to this parameter" href="#device_type">&#x1F517;&#xFE0E;</a>`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, aliases: ``device``
 
    - device for the tree learning, you can use GPU to achieve the faster learning
1 change: 1 addition & 0 deletions include/LightGBM/config.h
@@ -192,6 +192,7 @@ struct Config {
   // desc = do not set it too large if your dataset is small (for instance, do not use 64 threads for a dataset with 10,000 rows)
   // desc = be aware a task manager or any similar CPU monitoring tool might report that cores not being fully utilized. **This is normal**
   // desc = for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
+  // desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors
   int num_threads = 0;
 
   // [doc-only]
5 changes: 1 addition & 4 deletions include/LightGBM/dataset.h
@@ -292,10 +292,7 @@ struct TrainingTempState {
       return;
     }
     multi_val_bin.reset(bin);
-    int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-    { num_threads = omp_get_num_threads(); }
+    int num_threads = OMP_NUM_THREADS();
     num_bin_aligned =
         (bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
     size_t new_size = static_cast<size_t>(num_bin_aligned) * 2 * num_threads;
7 changes: 1 addition & 6 deletions include/LightGBM/utils/array_args.h
@@ -21,12 +21,7 @@ template<typename VAL_T>
 class ArrayArgs {
  public:
   inline static size_t ArgMaxMT(const std::vector<VAL_T>& array) {
-    int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-    {
-      num_threads = omp_get_num_threads();
-    }
+    int num_threads = OMP_NUM_THREADS();
     std::vector<size_t> arg_maxs(num_threads, 0);
     int n_blocks = Threading::For<size_t>(
         0, array.size(), 1024,
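
To make the new call concrete, here is a standalone sketch of the per-thread argmax reduction that `ArgMaxMT` performs, simplified to a plain `omp for` loop in place of LightGBM's `Threading::For`. `OMP_NUM_THREADS()` is the helper this PR adds in `openmp_wrapper.h` below; the rest of the program is illustrative only and assumes an OpenMP build:

```cpp
#include <omp.h>

#include <cstddef>
#include <vector>

inline int OMP_NUM_THREADS() {
  int ret = 1;
#pragma omp parallel
#pragma omp master
  { ret = omp_get_num_threads(); }
  return ret;
}

template <typename T>
size_t ArgMaxSketch(const std::vector<T>& a) {
  if (a.empty()) return 0;
  const int num_threads = OMP_NUM_THREADS();
  std::vector<size_t> arg_maxs(num_threads, 0);  // one candidate per thread
#pragma omp parallel
  {
    const int tid = omp_get_thread_num();  // always < arg_maxs.size()
#pragma omp for
    for (int i = 0; i < static_cast<int>(a.size()); ++i) {
      if (a[i] > a[arg_maxs[tid]]) arg_maxs[tid] = static_cast<size_t>(i);
    }
  }
  // Cheap sequential reduction over the few per-thread candidates.
  size_t best = arg_maxs[0];
  for (int t = 1; t < num_threads; ++t) {
    if (a[arg_maxs[t]] > a[best]) best = arg_maxs[t];
  }
  return best;
}
```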
12 changes: 2 additions & 10 deletions include/LightGBM/utils/common.h
@@ -727,12 +727,7 @@ template<typename _RanIt, typename _Pr, typename _VTRanIt> inline
 static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) {
   size_t len = _Last - _First;
   const size_t kMinInnerLen = 1024;
-  int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-  {
-    num_threads = omp_get_num_threads();
-  }
+  int num_threads = OMP_NUM_THREADS();
   if (len <= kMinInnerLen || num_threads <= 1) {
     std::sort(_First, _Last, _Pred);
     return;
@@ -1032,10 +1027,7 @@ class Timer {
  public:
   Timer() {
 #ifdef TIMETAG
-    int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-    { num_threads = omp_get_num_threads(); }
+    int num_threads = OMP_NUM_THREADS();
     start_time_.resize(num_threads);
     stats_.resize(num_threads);
 #endif  // TIMETAG
9 changes: 9 additions & 0 deletions include/LightGBM/utils/openmp_wrapper.h
@@ -15,6 +15,14 @@
 #include <stdexcept>
 #include <vector>
 
+inline int OMP_NUM_THREADS() {
+  int ret = 1;
+#pragma omp parallel
+#pragma omp master
+  { ret = omp_get_num_threads(); }
+  return ret;
+}
+
 class ThreadExceptionHelper {
  public:
   ThreadExceptionHelper() {
@@ -70,6 +78,7 @@ class ThreadExceptionHelper {
 inline void omp_set_num_threads(int) {}
 inline int omp_get_num_threads() {return 1;}
 inline int omp_get_thread_num() {return 0;}
+inline int OMP_NUM_THREADS() { return 1; }
 #ifdef __cplusplus
 };  // extern "C"
 #endif
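
This new helper is the heart of the PR: it opens a parallel region and reads its size from the master thread, so it always reflects the `omp_set_num_threads()` setting in effect at the moment of the call, and the stub in the second hunk keeps it well defined in builds without OpenMP. A minimal standalone demo of the pattern; the `main()` here is illustrative only and assumes an OpenMP build:

```cpp
#include <omp.h>

#include <cstdio>
#include <vector>

inline int OMP_NUM_THREADS() {
  int ret = 1;
#pragma omp parallel
#pragma omp master
  { ret = omp_get_num_threads(); }
  return ret;
}

int main() {
  omp_set_num_threads(4);
  // Queried right before use, so it matches the parallel region below.
  const int num_threads = OMP_NUM_THREADS();
  std::vector<long long> partial(num_threads, 0);  // one slot per thread
#pragma omp parallel
  {
    const int tid = omp_get_thread_num();  // always < partial.size()
#pragma omp for
    for (int i = 1; i <= 100; ++i) partial[tid] += i;
  }
  long long total = 0;
  for (long long p : partial) total += p;
  std::printf("threads=%d total=%lld\n", num_threads, total);  // total=5050
  return 0;
}
```

Note the helper pays a fork/join per call, which is presumably why call sites hoist the result into a local `num_threads` once rather than calling it inside loops.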
10 changes: 2 additions & 8 deletions include/LightGBM/utils/threading.h
@@ -21,10 +21,7 @@ class Threading {
   template <typename INDEX_T>
   static inline void BlockInfo(INDEX_T cnt, INDEX_T min_cnt_per_block,
                                int* out_nblock, INDEX_T* block_size) {
-    int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-    { num_threads = omp_get_num_threads(); }
+    int num_threads = OMP_NUM_THREADS();
     BlockInfo<INDEX_T>(num_threads, cnt, min_cnt_per_block, out_nblock,
                        block_size);
   }
@@ -84,10 +81,7 @@ class ParallelPartitionRunner {
  public:
   ParallelPartitionRunner(INDEX_T num_data, INDEX_T min_block_size)
       : min_block_size_(min_block_size) {
-    num_threads_ = 1;
-#pragma omp parallel
-#pragma omp master
-    { num_threads_ = omp_get_num_threads(); }
+    num_threads_ = OMP_NUM_THREADS();
     left_.resize(num_data);
     if (TWO_BUFFER) {
       right_.resize(num_data);
6 changes: 1 addition & 5 deletions src/application/predictor.hpp
@@ -56,16 +56,13 @@ class Predictor {
       }
     }
 
-#pragma omp parallel
-#pragma omp master
-    { num_threads_ = omp_get_num_threads(); }
     boosting->InitPredict(num_iteration, predict_contrib);
     boosting_ = boosting;
     num_pred_one_row_ = boosting_->NumPredictOneRow(
         num_iteration, predict_leaf_index, predict_contrib);
     num_feature_ = boosting_->MaxFeatureIdx() + 1;
     predict_buf_.resize(
-        num_threads_,
+        OMP_NUM_THREADS(),
         std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>(
             num_feature_, 0.0f));
     const int kFeatureThreshold = 100000;
@@ -281,7 +278,6 @@ class Predictor {
   PredictionEarlyStopInstance early_stop_;
   int num_feature_;
   int num_pred_one_row_;
-  int num_threads_;
   std::vector<std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>> predict_buf_;
 };
 
7 changes: 1 addition & 6 deletions src/c_api.cpp
@@ -1529,12 +1529,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle,
   if (config.num_threads > 0) {
     omp_set_num_threads(config.num_threads);
   }
-  int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-  {
-    num_threads = omp_get_num_threads();
-  }
+  int num_threads = OMP_NUM_THREADS();
   int ncol = static_cast<int>(ncol_ptr - 1);
   std::vector<std::vector<CSC_RowIterator>> iterators(num_threads, std::vector<CSC_RowIterator>());
   for (int i = 0; i < num_threads; ++i) {
15 changes: 3 additions & 12 deletions src/io/dataset.cpp
@@ -506,10 +506,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
   }
   const auto& offsets = feature_groups_[multi_group_id]->bin_offsets_;
   const int num_feature = feature_groups_[multi_group_id]->num_feature_;
-  int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-  { num_threads = omp_get_num_threads(); }
+  int num_threads = OMP_NUM_THREADS();
 
   std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
   std::vector<uint32_t> most_freq_bins;
@@ -539,10 +536,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
 MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
   Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures",
                                  global_timer);
-  int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-  { num_threads = omp_get_num_threads(); }
+  int num_threads = OMP_NUM_THREADS();
   double sum_dense_ratio = 0;
 
   std::unique_ptr<MultiValBin> ret;
@@ -1185,10 +1179,7 @@ void Dataset::ConstructHistogramsMultiVal(
   if (multi_val_bin == nullptr) {
     return;
   }
-  int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-  { num_threads = omp_get_num_threads(); }
+  int num_threads = OMP_NUM_THREADS();
 
   global_timer.Start("Dataset::sparse_bin_histogram");
   const int num_bin = multi_val_bin->num_bin();
5 changes: 1 addition & 4 deletions src/io/multi_val_sparse_bin.hpp
@@ -25,10 +25,7 @@ class MultiValSparseBin : public MultiValBin {
       estimate_element_per_row_(estimate_element_per_row) {
     row_ptr_.resize(num_data_ + 1, 0);
     INDEX_T estimate_num_data = static_cast<INDEX_T>(estimate_element_per_row_ * 1.1 * num_data_);
-    int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-    { num_threads = omp_get_num_threads(); }
+    int num_threads = OMP_NUM_THREADS();
     if (num_threads > 1) {
       t_data_.resize(num_threads - 1);
       for (size_t i = 0; i < t_data_.size(); ++i) {
7 changes: 1 addition & 6 deletions src/io/sparse_bin.hpp
@@ -73,12 +73,7 @@ class SparseBin: public Bin {
 
   explicit SparseBin(data_size_t num_data)
       : num_data_(num_data) {
-    int num_threads = 1;
-#pragma omp parallel
-#pragma omp master
-    {
-      num_threads = omp_get_num_threads();
-    }
+    int num_threads = OMP_NUM_THREADS();
     push_buffers_.resize(num_threads);
   }
 
13 changes: 3 additions & 10 deletions src/metric/map_metric.hpp
@@ -23,12 +23,6 @@ class MapMetric:public Metric {
     // get eval position
     eval_at_ = config.eval_at;
     DCGCalculator::DefaultEvalAt(&eval_at_);
-    // get number of threads
-#pragma omp parallel
-#pragma omp master
-    {
-      num_threads_ = omp_get_num_threads();
-    }
   }
 
   ~MapMetric() {
@@ -110,8 +104,9 @@
   }
   std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
     // some buffers for multi-threading sum up
+    int num_thread = OMP_NUM_THREADS();
     std::vector<std::vector<double>> result_buffer_;
-    for (int i = 0; i < num_threads_; ++i) {
+    for (int i = 0; i < num_thread; ++i) {
       result_buffer_.emplace_back(eval_at_.size(), 0.0f);
     }
     std::vector<double> tmp_map(eval_at_.size(), 0.0f);
@@ -139,7 +134,7 @@
     // Get final average MAP
     std::vector<double> result(eval_at_.size(), 0.0f);
     for (size_t j = 0; j < result.size(); ++j) {
-      for (int i = 0; i < num_threads_; ++i) {
+      for (int i = 0; i < num_thread; ++i) {
         result[j] += result_buffer_[i][j];
       }
       result[j] /= sum_query_weights_;
@@ -162,8 +157,6 @@
   double sum_query_weights_;
   /*! \brief Evaluate position of Nmap */
   std::vector<data_size_t> eval_at_;
-  /*! \brief Number of threads */
-  int num_threads_;
   std::vector<std::string> name_;
   std::vector<data_size_t> npos_per_query_;
 };
13 changes: 3 additions & 10 deletions src/metric/rank_metric.hpp
@@ -26,12 +26,6 @@ class NDCGMetric:public Metric {
     DCGCalculator::DefaultLabelGain(&label_gain);
     // initialize DCG calculator
     DCGCalculator::Init(label_gain);
-    // get number of threads
-#pragma omp parallel
-#pragma omp master
-    {
-      num_threads_ = omp_get_num_threads();
-    }
   }
 
   ~NDCGMetric() {
@@ -89,9 +83,10 @@
   }
 
   std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
+    int num_threads = OMP_NUM_THREADS();
     // some buffers for multi-threading sum up
     std::vector<std::vector<double>> result_buffer_;
-    for (int i = 0; i < num_threads_; ++i) {
+    for (int i = 0; i < num_threads; ++i) {
       result_buffer_.emplace_back(eval_at_.size(), 0.0f);
     }
     std::vector<double> tmp_dcg(eval_at_.size(), 0.0f);
@@ -139,7 +134,7 @@
     // Get final average NDCG
     std::vector<double> result(eval_at_.size(), 0.0f);
     for (size_t j = 0; j < result.size(); ++j) {
-      for (int i = 0; i < num_threads_; ++i) {
+      for (int i = 0; i < num_threads; ++i) {
         result[j] += result_buffer_[i][j];
       }
       result[j] /= sum_query_weights_;
@@ -166,8 +161,6 @@
   std::vector<data_size_t> eval_at_;
   /*! \brief Cache the inverse max dcg for all queries */
   std::vector<std::vector<double>> inverse_max_dcgs_;
-  /*! \brief Number of threads */
-  int num_threads_;
 };
 
 }  // namespace LightGBM
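
The metric and tree-learner changes all follow one theme: the cached `num_threads_` members are dropped in favor of querying the thread count at the point of use. The sketch below (a hypothetical class, not LightGBM's, assuming an OpenMP build) shows why the cache was fragile: if an external package raises the limit with `omp_set_num_threads()` after construction, a buffer sized from the stale cached value is too small and `buf[omp_get_thread_num()]` can index past its end. Querying per call keeps the buffer and the parallel region in lockstep:

```cpp
#include <omp.h>

#include <vector>

inline int OMP_NUM_THREADS() {
  int ret = 1;
#pragma omp parallel
#pragma omp master
  { ret = omp_get_num_threads(); }
  return ret;
}

class MetricSketch {
 public:
  // Old pattern (removed by this PR):
  //   int num_threads_ = OMP_NUM_THREADS();  // frozen at construction;
  // goes stale if omp_set_num_threads() is raised before Eval() runs.

  double Eval(const std::vector<double>& per_query_results) const {
    const int num_threads = OMP_NUM_THREADS();  // new pattern: query per call
    std::vector<double> buf(num_threads, 0.0);  // sized for the region below
#pragma omp parallel
    {
      const int tid = omp_get_thread_num();  // guaranteed < buf.size()
#pragma omp for
      for (int i = 0; i < static_cast<int>(per_query_results.size()); ++i) {
        buf[tid] += per_query_results[i];
      }
    }
    double sum = 0.0;
    for (double b : buf) sum += b;
    return sum;
  }
};
```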
5 changes: 3 additions & 2 deletions src/treelearner/data_parallel_tree_learner.cpp
@@ -165,8 +165,9 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
 
 template <typename TREELEARNER_T>
 void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
-  std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_, SplitInfo());
-  std::vector<SplitInfo> larger_bests_per_thread(this->num_threads_, SplitInfo());
+  int num_threads = OMP_NUM_THREADS();
+  std::vector<SplitInfo> smaller_bests_per_thread(num_threads, SplitInfo());
+  std::vector<SplitInfo> larger_bests_per_thread(num_threads, SplitInfo());
   std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
   std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
   if (this->config_->feature_fraction_bynode < 1.0f) {
10 changes: 3 additions & 7 deletions src/treelearner/serial_tree_learner.cpp
@@ -21,11 +21,6 @@ namespace LightGBM {
 SerialTreeLearner::SerialTreeLearner(const Config* config)
     :config_(config) {
   random_ = Random(config_->feature_fraction_seed);
-#pragma omp parallel
-#pragma omp master
-  {
-    num_threads_ = omp_get_num_threads();
-  }
 }
 
 SerialTreeLearner::~SerialTreeLearner() {
@@ -400,8 +395,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
     const std::vector<int8_t>& is_feature_used, bool use_subtract) {
   Common::FunctionTimer fun_timer(
       "SerialTreeLearner::FindBestSplitsFromHistograms", global_timer);
-  std::vector<SplitInfo> smaller_best(num_threads_);
-  std::vector<SplitInfo> larger_best(num_threads_);
+  int num_threads = OMP_NUM_THREADS();
+  std::vector<SplitInfo> smaller_best(num_threads);
+  std::vector<SplitInfo> larger_best(num_threads);
   std::vector<int8_t> smaller_node_used_features(num_features_, 1);
   std::vector<int8_t> larger_node_used_features(num_features_, 1);
   if (config_->feature_fraction_bynode < 1.0f) {
1 change: 0 additions & 1 deletion src/treelearner/serial_tree_learner.h
@@ -189,7 +189,6 @@ class SerialTreeLearner: public TreeLearner {
   HistogramPool histogram_pool_;
   /*! \brief config of tree learner*/
   const Config* config_;
-  int num_threads_;
   std::vector<int> ordered_bin_indices_;
   bool is_constant_hessian_;
   std::unique_ptr<TrainingTempState> temp_state_;
5 changes: 3 additions & 2 deletions src/treelearner/voting_parallel_tree_learner.cpp
@@ -349,8 +349,9 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
 
 template <typename TREELEARNER_T>
 void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
-  std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_);
-  std::vector<SplitInfo> larger_best_per_thread(this->num_threads_);
+  int num_threads = OMP_NUM_THREADS();
+  std::vector<SplitInfo> smaller_bests_per_thread(num_threads);
+  std::vector<SplitInfo> larger_best_per_thread(num_threads);
   std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
   std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
   if (this->config_->feature_fraction_bynode < 1.0f) {