Fix add features (#2754)
* fix subset bug

* typo

* add fixme tag

* bin mapper

* fix test

* fix add_features_from

* Update dataset.cpp

* fix merge bug

* added Python merge code

* added test for add_features

* Update dataset.cpp

* Update src/io/dataset.cpp

* continue implementing

* warn users about categorical features

Co-authored-by: StrikerRUS <[email protected]>
Co-authored-by: Nikita Titov <[email protected]>
3 people authored Oct 26, 2020
1 parent ceb6265 commit 53977f3
Showing 4 changed files with 306 additions and 117 deletions.
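[Editor's note] Before the diffs, a quick orientation: the user-facing entry point touched by this commit is Dataset.add_features_from in the Python package. A minimal usage sketch, hedged — the data, seed, and variable names are illustrative, not from the commit:

import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X_a = rng.random((100, 3))
X_b = rng.random((100, 2))
y = rng.random(100)

# free_raw_data=False keeps the raw matrices alive so the Python-side
# merge added in this commit can update self.data as well.
ds_a = lgb.Dataset(X_a, label=y, free_raw_data=False).construct()
ds_b = lgb.Dataset(X_b, free_raw_data=False).construct()

# Appends ds_b's features to ds_a in place; per this commit, categorical
# features are reset to "auto" and a warning is emitted.
ds_a.add_features_from(ds_b)
print(ds_a.num_feature())  # expected: 5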
97 changes: 58 additions & 39 deletions include/LightGBM/feature_group.h
@@ -1,6 +1,7 @@
/*!
* Copyright (c) 2017 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
*/
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_
@@ -17,7 +18,8 @@ namespace LightGBM {

class Dataset;
class DatasetLoader;
/*! \brief Using to store data and providing some operations on one feature group*/
/*! \brief Using to store data and providing some operations on one feature
* group*/
class FeatureGroup {
public:
friend Dataset;
@@ -83,13 +85,13 @@ class FeatureGroup {
}

/*!
* \brief Constructor from memory
* \param memory Pointer of memory
* \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data
*/
* \brief Constructor from memory
* \param memory Pointer of memory
* \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data
*/
FeatureGroup(const void* memory, data_size_t num_all_data,
const std::vector<data_size_t>& local_used_indices) {
const std::vector<data_size_t>& local_used_indices) {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get is_sparse
is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
@@ -122,9 +124,11 @@ class FeatureGroup {
for (int i = 0; i < num_feature_; ++i) {
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
multi_bin_data_.emplace_back(Bin::CreateSparseBin(
num_data, bin_mappers_[i]->num_bin() + addi));
} else {
multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
multi_bin_data_.emplace_back(
Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
}
multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
memory_ptr += multi_bin_data_.back()->SizesInByte();
@@ -141,18 +145,20 @@ class FeatureGroup {
}

/*! \brief Destructor */
~FeatureGroup() {
}
~FeatureGroup() {}

/*!
* \brief Push one record, will auto convert to bin and push to bin data
* \param tid Thread id
* \param idx Index of record
* \param value feature value of record
*/
inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
* \brief Push one record, will auto convert to bin and push to bin data
* \param tid Thread id
* \param idx Index of record
* \param value feature value of record
*/
inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx,
double value) {
uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) { return; }
if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
return;
}
if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
bin -= 1;
}
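[Editor's note] An aside on the logic being re-wrapped here: PushData never stores the most frequent bin, and when that bin is 0 every stored bin index shifts down by one. A hedged Python sketch of that encoding — mapper and its methods are illustrative stand-ins for the C++ BinMapper, not a real API:

def encode_bin(value, mapper):
    # Mirrors ValueToBin() / GetMostFreqBin() in the C++ above.
    bin_idx = mapper.value_to_bin(value)
    if bin_idx == mapper.most_freq_bin():
        return None  # the most frequent bin is implicit; nothing is stored
    if mapper.most_freq_bin() == 0:
        bin_idx -= 1  # bin 0 is implicit, so stored bins shift down by one
    return bin_idx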
@@ -184,6 +190,23 @@ class FeatureGroup {
}
}

void AddFeaturesFrom(const FeatureGroup* other) {
  CHECK(is_multi_val_);
  CHECK(other->is_multi_val_);
  for (int i = 0; i < other->num_feature_; ++i) {
    const auto& other_bin_mapper = other->bin_mappers_[i];
    bin_mappers_.emplace_back(new BinMapper(*other_bin_mapper));
    auto num_bin = other_bin_mapper->num_bin();
    if (other_bin_mapper->GetMostFreqBin() == 0) {
      num_bin -= 1;
    }
    num_total_bin_ += num_bin;
    bin_offsets_.emplace_back(num_total_bin_);
    multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
  }
  num_feature_ += other->num_feature_;
}

inline BinIterator* SubFeatureIterator(int sub_feature) {
uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
if (!is_multi_val_) {
@@ -194,14 +217,15 @@ class FeatureGroup {
int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
uint32_t min_bin = 1;
uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, most_freq_bin);
return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin,
most_freq_bin);
}
}

inline void FinishLoad() {
if (is_multi_val_) {
OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_feature_; ++i) {
OMP_LOOP_EX_BEGIN();
multi_bin_data_[i]->FinishLoad();
@@ -213,11 +237,6 @@ class FeatureGroup {
}
}

/*!
* \brief Returns a BinIterator that can access the entire feature group's raw data.
* The RawGet() function of the iterator should be called for best efficiency.
* \return A pointer to the BinIterator object
*/
inline BinIterator* FeatureGroupIterator() {
if (is_multi_val_) {
return nullptr;
@@ -288,18 +307,18 @@ class FeatureGroup {
}

/*!
* \brief From bin to feature value
* \param bin
* \return FeatureGroup value of this bin
*/
* \brief From bin to feature value
* \param bin
* \return FeatureGroup value of this bin
*/
inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
return bin_mappers_[sub_feature_idx]->BinToValue(bin);
}

/*!
* \brief Save binary data to file
* \param file File want to write
*/
* \brief Save binary data to file
* \param file File want to write
*/
void SaveBinaryToFile(const VirtualFileWriter* writer) const {
writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
@@ -317,8 +336,8 @@ class FeatureGroup {
}

/*!
* \brief Get sizes in byte of this object
*/
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const {
size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
@@ -377,8 +396,9 @@ class FeatureGroup {
}
is_multi_val_ = true;
} else {
if (force_sparse || (!force_dense && num_feature_ == 1 &&
bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
if (force_sparse ||
(!force_dense && num_feature_ == 1 &&
bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
is_sparse_ = true;
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else {
@@ -404,7 +424,6 @@ class FeatureGroup {
int num_total_bin_;
};


} // namespace LightGBM

#endif // LIGHTGBM_FEATURE_GROUP_H_
#endif // LIGHTGBM_FEATURE_GROUP_H_
70 changes: 70 additions & 0 deletions python-package/lightgbm/basic.py
@@ -1904,6 +1904,76 @@ def add_features_from(self, other):
        if self.handle is None or other.handle is None:
            raise ValueError('Both source and target Datasets must be constructed before adding features')
        _safe_call(_LIB.LGBM_DatasetAddFeaturesFrom(self.handle, other.handle))
        was_none = self.data is None
        old_self_data_type = type(self.data).__name__
        if other.data is None:
            self.data = None
        elif self.data is not None:
            if isinstance(self.data, np.ndarray):
                if isinstance(other.data, np.ndarray):
                    self.data = np.hstack((self.data, other.data))
                elif scipy.sparse.issparse(other.data):
                    self.data = np.hstack((self.data, other.data.toarray()))
                elif isinstance(other.data, DataFrame):
                    self.data = np.hstack((self.data, other.data.values))
                elif isinstance(other.data, DataTable):
                    self.data = np.hstack((self.data, other.data.to_numpy()))
                else:
                    self.data = None
            elif scipy.sparse.issparse(self.data):
                sparse_format = self.data.getformat()
                if isinstance(other.data, np.ndarray) or scipy.sparse.issparse(other.data):
                    self.data = scipy.sparse.hstack((self.data, other.data), format=sparse_format)
                elif isinstance(other.data, DataFrame):
                    self.data = scipy.sparse.hstack((self.data, other.data.values), format=sparse_format)
                elif isinstance(other.data, DataTable):
                    self.data = scipy.sparse.hstack((self.data, other.data.to_numpy()), format=sparse_format)
                else:
                    self.data = None
            elif isinstance(self.data, DataFrame):
                if not PANDAS_INSTALLED:
                    raise LightGBMError("Cannot add features to DataFrame type of raw data "
                                        "without pandas installed")
                from pandas import concat
                if isinstance(other.data, np.ndarray):
                    self.data = concat((self.data, DataFrame(other.data)),
                                       axis=1, ignore_index=True)
                elif scipy.sparse.issparse(other.data):
                    self.data = concat((self.data, DataFrame(other.data.toarray())),
                                       axis=1, ignore_index=True)
                elif isinstance(other.data, DataFrame):
                    self.data = concat((self.data, other.data),
                                       axis=1, ignore_index=True)
                elif isinstance(other.data, DataTable):
                    self.data = concat((self.data, DataFrame(other.data.to_numpy())),
                                       axis=1, ignore_index=True)
                else:
                    self.data = None
            elif isinstance(self.data, DataTable):
                if isinstance(other.data, np.ndarray):
                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data)))
                elif scipy.sparse.issparse(other.data):
                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data.toarray())))
                elif isinstance(other.data, DataFrame):
                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data.values)))
                elif isinstance(other.data, DataTable):
                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data.to_numpy())))
                else:
                    self.data = None
        else:
            self.data = None
        if self.data is None:
            err_msg = ("Cannot add features from {} type of raw data to "
                       "{} type of raw data.\n").format(type(other.data).__name__,
                                                        old_self_data_type)
            err_msg += ("Set free_raw_data=False when constructing the Dataset to avoid this"
                        if was_none else "Freeing raw data")
            warnings.warn(err_msg)
        self.feature_name = self.get_feature_name()
        warnings.warn("Resetting categorical features.\n"
                      "You can set new categorical features via ``set_categorical_feature`` method")
        self.categorical_feature = "auto"
        self.pandas_categorical = None
        return self

def _dump_text(self, filename):
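
[Editor's note] The branchy merge above keeps self.data in its original container type where it can (numpy array, scipy sparse matrix, pandas DataFrame, datatable Frame) and otherwise sets it to None with a warning. A hedged sketch of the sparse path under that behavior — shapes, seed, and values are illustrative:

import numpy as np
import scipy.sparse
import lightgbm as lgb

rng = np.random.default_rng(0)
X_sparse = scipy.sparse.csr_matrix(np.eye(50))
X_dense = rng.random((50, 4))

ds_sparse = lgb.Dataset(X_sparse, free_raw_data=False).construct()
ds_dense = lgb.Dataset(X_dense, free_raw_data=False).construct()

ds_sparse.add_features_from(ds_dense)
# Raw data keeps its original sparse format, via the
# scipy.sparse.hstack(..., format=sparse_format) branch above.
print(ds_sparse.data.getformat())  # -> "csr"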