Fix add features #2754

Merged: 28 commits into master from fix-add-features, Oct 26, 2020

Commits
a9974b6 fix subset bug (guolinke, Feb 7, 2020)
1e9af5b typo (guolinke, Feb 7, 2020)
b6bb36c add fixme tag (guolinke, Feb 7, 2020)
3741002 bin mapper (guolinke, Feb 7, 2020)
acccd9f fix test (guolinke, Feb 7, 2020)
bd33f8a fix add_features_from (guolinke, Feb 8, 2020)
5d7dbf1 fixed conflict (StrikerRUS, Feb 8, 2020)
706d1dd Update dataset.cpp (guolinke, Feb 10, 2020)
85a9e3f Merge branch 'master' into fix-add-features (guolinke, Feb 10, 2020)
fe506a0 fix merge bug (guolinke, Feb 10, 2020)
0f57239 Merge branch 'master' into fix-add-features (guolinke, Feb 15, 2020)
a086c0d Merge branch 'master' into fix-add-features (guolinke, Feb 19, 2020)
604c41e Merge branch 'master' into fix-add-features (guolinke, Feb 23, 2020)
278031b Merge branch 'master' into fix-add-features (guolinke, Mar 4, 2020)
6a07005 added Python merge code (StrikerRUS, Mar 4, 2020)
16dae64 Merge branch 'master' into fix-add-features (StrikerRUS, Mar 6, 2020)
3cb29ad added test for add_features (StrikerRUS, Apr 19, 2020)
f6b440a merge from master (guolinke, Apr 25, 2020)
7799f9a Merge branch 'master' into fix-add-features (StrikerRUS, Jun 11, 2020)
98ed3e5 Update dataset.cpp (guolinke, Jul 7, 2020)
2874ba3 Merge branch 'master' into fix-add-features (StrikerRUS, Jul 7, 2020)
4b20942 Update src/io/dataset.cpp (guolinke, Jul 12, 2020)
1427314 Merge branch 'master' into fix-add-features (guolinke, Aug 5, 2020)
26dfd5e Merge branch 'master' into fix-add-features (StrikerRUS, Sep 6, 2020)
79666fb Merge branch 'master' into fix-add-features (StrikerRUS, Oct 5, 2020)
32526e5 continue implementing (StrikerRUS, Oct 5, 2020)
833b351 warn users about categorical features (StrikerRUS, Oct 5, 2020)
c2c3bf3 Merge branch 'master' into fix-add-features (guolinke, Oct 26, 2020)
100 changes: 60 additions & 40 deletions include/LightGBM/feature_group.h
@@ -1,6 +1,7 @@
 /*!
  * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
- * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ * Licensed under the MIT License. See LICENSE file in the project root for
+ * license information.
  */
 #ifndef LIGHTGBM_FEATURE_GROUP_H_
 #define LIGHTGBM_FEATURE_GROUP_H_
@@ -17,7 +18,8 @@ namespace LightGBM {

 class Dataset;
 class DatasetLoader;
-/*! \brief Using to store data and providing some operations on one feature group*/
+/*! \brief Using to store data and providing some operations on one feature
+ * group*/
 class FeatureGroup {
  public:
   friend Dataset;
@@ -83,13 +85,13 @@ class FeatureGroup {
   }

   /*!
-  * \brief Constructor from memory
-  * \param memory Pointer of memory
-  * \param num_all_data Number of global data
-  * \param local_used_indices Local used indices, empty means using all data
-  */
+   * \brief Constructor from memory
+   * \param memory Pointer of memory
+   * \param num_all_data Number of global data
+   * \param local_used_indices Local used indices, empty means using all data
+   */
   FeatureGroup(const void* memory, data_size_t num_all_data,
-    const std::vector<data_size_t>& local_used_indices) {
+               const std::vector<data_size_t>& local_used_indices) {
     const char* memory_ptr = reinterpret_cast<const char*>(memory);
     // get is_sparse
     is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
@@ -122,9 +124,11 @@ class FeatureGroup {
     for (int i = 0; i < num_feature_; ++i) {
       int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
       if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
-        multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        multi_bin_data_.emplace_back(Bin::CreateSparseBin(
+            num_data, bin_mappers_[i]->num_bin() + addi));
       } else {
-        multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        multi_bin_data_.emplace_back(
+            Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
       }
       multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
       memory_ptr += multi_bin_data_.back()->SizesInByte();
@@ -141,18 +145,20 @@ class FeatureGroup {
   }

   /*! \brief Destructor */
-  ~FeatureGroup() {
-  }
+  ~FeatureGroup() {}

   /*!
-  * \brief Push one record, will auto convert to bin and push to bin data
-  * \param tid Thread id
-  * \param idx Index of record
-  * \param value feature value of record
-  */
-  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
+   * \brief Push one record, will auto convert to bin and push to bin data
+   * \param tid Thread id
+   * \param idx Index of record
+   * \param value feature value of record
+   */
+  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx,
+                       double value) {
     uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
-    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) { return; }
+    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
+      return;
+    }
     if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
       bin -= 1;
     }
@@ -184,6 +190,23 @@ class FeatureGroup {
     }
   }

+  void AddFeaturesFrom(const FeatureGroup* other) {
+    CHECK(is_multi_val_);
+    CHECK(other->is_multi_val_);
+    for (int i = 0; i < other->num_feature_; ++i) {
+      const auto& other_bin_mapper = other->bin_mappers_[i];
+      bin_mappers_.emplace_back(new BinMapper(*other_bin_mapper));
+      auto num_bin = other_bin_mapper->num_bin();
+      if (other_bin_mapper->GetMostFreqBin() == 0) {
+        num_bin -= 1;
+      }
+      num_total_bin_ += num_bin;
+      bin_offsets_.emplace_back(num_total_bin_);
+      multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
+    }
+    num_feature_ += other->num_feature_;
+  }
+
   inline BinIterator* SubFeatureIterator(int sub_feature) {
     uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
     if (!is_multi_val_) {
@@ -194,14 +217,15 @@ class FeatureGroup {
       int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
       uint32_t min_bin = 1;
       uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
-      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, most_freq_bin);
+      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin,
+                                                       most_freq_bin);
     }
   }

   inline void FinishLoad() {
     if (is_multi_val_) {
       OMP_INIT_EX();
-      #pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(guided)
       for (int i = 0; i < num_feature_; ++i) {
         OMP_LOOP_EX_BEGIN();
         multi_bin_data_[i]->FinishLoad();
@@ -213,11 +237,6 @@ class FeatureGroup {
     }
   }

-  /*!
-   * \brief Returns a BinIterator that can access the entire feature group's raw data.
-   * The RawGet() function of the iterator should be called for best efficiency.
-   * \return A pointer to the BinIterator object
-   */
   inline BinIterator* FeatureGroupIterator() {
     if (is_multi_val_) {
       return nullptr;
@@ -277,18 +296,18 @@ class FeatureGroup {
   }

   /*!
-  * \brief From bin to feature value
-  * \param bin
-  * \return FeatureGroup value of this bin
-  */
+   * \brief From bin to feature value
+   * \param bin
+   * \return FeatureGroup value of this bin
+   */
   inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
     return bin_mappers_[sub_feature_idx]->BinToValue(bin);
   }

   /*!
-  * \brief Save binary data to file
-  * \param file File want to write
-  */
+   * \brief Save binary data to file
+   * \param file File want to write
+   */
   void SaveBinaryToFile(const VirtualFileWriter* writer) const {
     writer->Write(&is_multi_val_, sizeof(is_multi_val_));
     writer->Write(&is_sparse_, sizeof(is_sparse_));
@@ -306,10 +325,11 @@ class FeatureGroup {
   }

   /*!
-  * \brief Get sizes in byte of this object
-  */
+   * \brief Get sizes in byte of this object
+   */
   size_t SizesInByte() const {
-    size_t ret = sizeof(is_multi_val_) + sizeof(is_sparse_) + sizeof(num_feature_);
+    size_t ret =
+        sizeof(is_multi_val_) + sizeof(is_sparse_) + sizeof(num_feature_);
     for (int i = 0; i < num_feature_; ++i) {
       ret += bin_mappers_[i]->SizesInByte();
     }
@@ -364,8 +384,9 @@ class FeatureGroup {
     }
     is_multi_val_ = true;
   } else {
-    if (force_sparse || (!force_dense && num_feature_ == 1 &&
-        bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
+    if (force_sparse ||
+        (!force_dense && num_feature_ == 1 &&
+         bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
       is_sparse_ = true;
       bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
     } else {
@@ -391,7 +412,6 @@ class FeatureGroup {
   int num_total_bin_;
 };

-
 }  // namespace LightGBM

-#endif  // LIGHTGBM_FEATURE_GROUP_H_
+#endif  // LIGHTGBM_FEATURE_GROUP_H_
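
Aside on the new FeatureGroup::AddFeaturesFrom above: the only non-obvious bookkeeping is that a bin mapper whose most frequent bin is 0 stores that bin implicitly, so it contributes num_bin - 1 bins to the group's running total. A minimal Python sketch of that arithmetic, with made-up (num_bin, most_freq_bin) pairs and a made-up starting offset of 1 (not LightGBM API):

def add_features_from(bin_offsets, num_total_bin, other_mappers):
    # other_mappers: (num_bin, most_freq_bin) pairs of the appended features
    for num_bin, most_freq_bin in other_mappers:
        if most_freq_bin == 0:
            num_bin -= 1  # most-frequent bin is stored implicitly
        num_total_bin += num_bin
        bin_offsets.append(num_total_bin)  # mirrors bin_offsets_.emplace_back
    return bin_offsets, num_total_bin

# assumed fresh group whose offsets start at 1
offsets, total = add_features_from([1], 1, [(5, 0), (4, 2)])
print(offsets, total)  # [1, 5, 9] 9
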
65 changes: 65 additions & 0 deletions python-package/lightgbm/basic.py
@@ -1889,6 +1889,71 @@ def add_features_from(self, other):
         if self.handle is None or other.handle is None:
             raise ValueError('Both source and target Datasets must be constructed before adding features')
         _safe_call(_LIB.LGBM_DatasetAddFeaturesFrom(self.handle, other.handle))
+        was_none = self.data is None
+        old_self_data_type = type(self.data).__name__
+        if other.data is None:
+            self.data = None
+        elif self.data is not None:
+            if isinstance(self.data, np.ndarray):
+                if isinstance(other.data, np.ndarray):
+                    self.data = np.hstack((self.data, other.data))
+                elif scipy.sparse.issparse(other.data):
+                    self.data = np.hstack((self.data, other.data.toarray()))
+                elif isinstance(other.data, DataFrame):
+                    self.data = np.hstack((self.data, other.data.values))
+                elif isinstance(other.data, DataTable):
+                    self.data = np.hstack((self.data, other.data.to_numpy()))
+                else:
+                    self.data = None
+            elif scipy.sparse.issparse(self.data):
+                sparse_format = self.data.getformat()
+                if isinstance(other.data, np.ndarray) or scipy.sparse.issparse(other.data):
+                    self.data = scipy.sparse.hstack((self.data, other.data), format=sparse_format)
+                elif isinstance(other.data, DataFrame):
+                    self.data = scipy.sparse.hstack((self.data, other.data.values), format=sparse_format)
+                elif isinstance(other.data, DataTable):
+                    self.data = scipy.sparse.hstack((self.data, other.data.to_numpy()), format=sparse_format)
+                else:
+                    self.data = None
+            elif isinstance(self.data, DataFrame):
+                if not PANDAS_INSTALLED:
+                    raise LightGBMError("Cannot add features to DataFrame type of raw data "
+                                        "without pandas installed")
+                from pandas import concat
+                if isinstance(other.data, np.ndarray):
+                    self.data = concat((self.data, DataFrame(other.data)),
+                                       axis=1, ignore_index=True)
+                elif scipy.sparse.issparse(other.data):
+                    self.data = concat((self.data, DataFrame(other.data.toarray())),
+                                       axis=1, ignore_index=True)
+                elif isinstance(other.data, DataFrame):
+                    self.data = concat((self.data, other.data),
+                                       axis=1, ignore_index=True)
+                elif isinstance(other.data, DataTable):
+                    self.data = concat((self.data, DataFrame(other.data.to_numpy())),
+                                       axis=1, ignore_index=True)
+                else:
+                    self.data = None
+            elif isinstance(self.data, DataTable):
+                if isinstance(other.data, np.ndarray):
+                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data)))
+                elif scipy.sparse.issparse(other.data):
+                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data.toarray())))
+                elif isinstance(other.data, DataFrame):
+                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data.values)))
+                elif isinstance(other.data, DataTable):
+                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data.to_numpy())))
+                else:
+                    self.data = None
+            else:
+                self.data = None
+        if self.data is None:
+            err_msg = ("Cannot add features from {} type of raw data to "
+                       "{} type of raw data.\n").format(type(other.data).__name__,
+                                                        old_self_data_type)
+            err_msg += ("Set free_raw_data=False when construct Dataset to avoid this"
+                        if was_none else "Freeing raw data")
+            warnings.warn(err_msg)
         return self

     def _dump_text(self, filename):
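
For reference, a minimal usage sketch of Dataset.add_features_from as changed in this PR (synthetic data with arbitrary shapes; assumes a LightGBM build that contains this fix). Both Datasets must be constructed first, and free_raw_data=False is what keeps the raw arrays alive so the merge paths above run instead of emitting the warning:

import numpy as np
import lightgbm as lgb

X1 = np.random.rand(100, 3)
X2 = np.random.rand(100, 2)
y = np.random.rand(100)

# free_raw_data=False keeps .data on both Datasets, so the np.hstack
# path in the diff above can rebuild the merged raw data
d1 = lgb.Dataset(X1, label=y, free_raw_data=False).construct()
d2 = lgb.Dataset(X2, free_raw_data=False).construct()

d1.add_features_from(d2)
print(d1.num_feature())  # 5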