diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 2acdd083b9df..a5430c483d3b 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -352,7 +352,7 @@ namespace LightGBM { int zero_cnt = static_cast(total_sample_cnt - num_sample_values - na_cnt); // find distinct_values first std::vector distinct_values; - std::vector counts; + std::vector counts; // count of data points for each distinct feature value. std::stable_sort(values, values + num_sample_values); @@ -389,7 +389,7 @@ namespace LightGBM { } min_val_ = distinct_values.front(); max_val_ = distinct_values.back(); - std::vector cnt_in_bin; + std::vector cnt_in_bin; // count of data points in each bin. int num_distinct_values = static_cast(distinct_values.size()); if (bin_type_ == BinType::NumericalBin) { if (missing_type_ == MissingType::Zero) { @@ -446,12 +446,12 @@ namespace LightGBM { Log::Warning("Met categorical feature which contains sparse values. " "Consider renumbering to consecutive integers started from zero"); } - // sort by counts + // sort by counts in descending order Common::SortForPair(&counts_int, &distinct_values_int, 0, true); // will ignore the categorical of small counts int cut_cnt = static_cast( Common::RoundInt((total_sample_cnt - na_cnt) * 0.99f)); - size_t cur_cat = 0; + size_t cur_cat_idx = 0; // index of current category. categorical_2_bin_.clear(); bin_2_categorical_.clear(); int used_cnt = 0; @@ -467,20 +467,20 @@ namespace LightGBM { categorical_2_bin_[-1] = 0; cnt_in_bin.push_back(0); num_bin_ = 1; - while (cur_cat < distinct_values_int.size() + while (cur_cat_idx < distinct_values_int.size() && (used_cnt < cut_cnt || num_bin_ < max_bin)) { - if (counts_int[cur_cat] < min_data_in_bin && cur_cat > 1) { + if (counts_int[cur_cat_idx] < min_data_in_bin && cur_cat_idx > 1) { break; } - bin_2_categorical_.push_back(distinct_values_int[cur_cat]); - categorical_2_bin_[distinct_values_int[cur_cat]] = static_cast(num_bin_); - used_cnt += counts_int[cur_cat]; - cnt_in_bin.push_back(counts_int[cur_cat]); + bin_2_categorical_.push_back(distinct_values_int[cur_cat_idx]); + categorical_2_bin_[distinct_values_int[cur_cat_idx]] = static_cast(num_bin_); + used_cnt += counts_int[cur_cat_idx]; + cnt_in_bin.push_back(counts_int[cur_cat_idx]); ++num_bin_; - ++cur_cat; + ++cur_cat_idx; } // Use MissingType::None to represent this bin contains all categoricals - if (cur_cat == distinct_values_int.size() && na_cnt == 0) { + if (cur_cat_idx == distinct_values_int.size() && na_cnt == 0) { missing_type_ = MissingType::None; } else { missing_type_ = MissingType::NaN;