Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

renamed cur_cat => cur_cat_idx and added some comments #5522

Merged
merged 5 commits into from
Oct 11, 2022
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions src/io/bin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ namespace LightGBM {
int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values - na_cnt);
// find distinct_values first
std::vector<double> distinct_values;
std::vector<int> counts;
std::vector<int> counts; // count of data points for each distinct feature value.

std::stable_sort(values, values + num_sample_values);

Expand Down Expand Up @@ -389,7 +389,7 @@ namespace LightGBM {
}
min_val_ = distinct_values.front();
max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin;
std::vector<int> cnt_in_bin; // count of data points in each bin.
int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (missing_type_ == MissingType::Zero) {
Expand Down Expand Up @@ -446,12 +446,12 @@ namespace LightGBM {
Log::Warning("Met categorical feature which contains sparse values. "
"Consider renumbering to consecutive integers started from zero");
}
// sort by counts
// sort by counts in descending order
Common::SortForPair<int, int>(&counts_int, &distinct_values_int, 0, true);
// categories with small counts will be ignored
int cut_cnt = static_cast<int>(
Common::RoundInt((total_sample_cnt - na_cnt) * 0.99f));
size_t cur_cat = 0;
size_t cur_cat_idx = 0; // index of current category.
categorical_2_bin_.clear();
bin_2_categorical_.clear();
int used_cnt = 0;
Expand All @@ -467,20 +467,20 @@ namespace LightGBM {
categorical_2_bin_[-1] = 0;
cnt_in_bin.push_back(0);
num_bin_ = 1;
while (cur_cat < distinct_values_int.size()
while (cur_cat_idx < distinct_values_int.size()
&& (used_cnt < cut_cnt || num_bin_ < max_bin)) {
if (counts_int[cur_cat] < min_data_in_bin && cur_cat > 1) {
if (counts_int[cur_cat_idx] < min_data_in_bin && cur_cat_idx > 1) {
break;
}
bin_2_categorical_.push_back(distinct_values_int[cur_cat]);
categorical_2_bin_[distinct_values_int[cur_cat]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[cur_cat];
cnt_in_bin.push_back(counts_int[cur_cat]);
bin_2_categorical_.push_back(distinct_values_int[cur_cat_idx]);
categorical_2_bin_[distinct_values_int[cur_cat_idx]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[cur_cat_idx];
cnt_in_bin.push_back(counts_int[cur_cat_idx]);
++num_bin_;
++cur_cat;
++cur_cat_idx;
}
// Use MissingType::None to indicate that every category was assigned a bin and there are no NaN values
if (cur_cat == distinct_values_int.size() && na_cnt == 0) {
if (cur_cat_idx == distinct_values_int.size() && na_cnt == 0) {
missing_type_ = MissingType::None;
} else {
missing_type_ = MissingType::NaN;
Expand Down