[CUDA] Add multiclass_ova objective for cuda_exp #5491

Merged (11 commits) on Nov 6, 2022
2 changes: 1 addition & 1 deletion src/objective/cuda/cuda_binary_objective.cpp
@@ -69,7 +69,7 @@ double CUDABinaryLogloss::BoostFromScore(int) const {
CopyFromCUDADeviceToHost<double>(&boost_from_score, cuda_boost_from_score_, 1, __FILE__, __LINE__);
double pavg = 0.0f;
CopyFromCUDADeviceToHost<double>(&pavg, cuda_sum_weights_, 1, __FILE__, __LINE__);
- Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), __func__, pavg, boost_from_score);
+ Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), __func__, pavg, boost_from_score);
return boost_from_score;
}

66 changes: 20 additions & 46 deletions src/objective/cuda/cuda_binary_objective.cu
@@ -12,9 +12,9 @@

namespace LightGBM {

- template <bool IS_OVA, bool USE_WEIGHT>
+ template <bool USE_WEIGHT>
__global__ void BoostFromScoreKernel_1_BinaryLogloss(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_sum_labels,
- double* out_cuda_sum_weights, const label_t* cuda_weights, const int ova_class_id) {
+ double* out_cuda_sum_weights, const label_t* cuda_weights) {
__shared__ double shared_buffer[32];
const uint32_t mask = 0xffffffff;
const uint32_t warpLane = threadIdx.x % warpSize;
@@ -27,12 +27,12 @@ __global__ void BoostFromScoreKernel_1_BinaryLogloss(const label_t* cuda_labels,
if (USE_WEIGHT) {
const label_t cuda_label = cuda_labels[index];
const double sample_weight = cuda_weights[index];
- const label_t label = IS_OVA ? (static_cast<int>(cuda_label) == ova_class_id ? 1 : 0) : (cuda_label > 0 ? 1 : 0);
+ const label_t label = cuda_label > 0 ? 1 : 0;
label_value = label * sample_weight;
weight_value = sample_weight;
} else {
const label_t cuda_label = cuda_labels[index];
- label_value = IS_OVA ? (static_cast<int>(cuda_label) == ova_class_id ? 1 : 0) : (cuda_label > 0 ? 1 : 0);
+ label_value = cuda_label > 0 ? 1 : 0;
}
}
for (uint32_t offset = warpSize / 2; offset >= 1; offset >>= 1) {
@@ -88,22 +88,13 @@ __global__ void BoostFromScoreKernel_2_BinaryLogloss(double* out_cuda_sum_labels

void CUDABinaryLogloss::LaunchBoostFromScoreKernel() const {
const int num_blocks = (num_data_ + CALC_INIT_SCORE_BLOCK_SIZE_BINARY - 1) / CALC_INIT_SCORE_BLOCK_SIZE_BINARY;
- if (ova_class_id_ == -1) {
- if (cuda_weights_ == nullptr) {
- BoostFromScoreKernel_1_BinaryLogloss<false, false><<<num_blocks, CALC_INIT_SCORE_BLOCK_SIZE_BINARY>>>
- (cuda_label_, num_data_, cuda_boost_from_score_, cuda_sum_weights_, cuda_weights_, ova_class_id_);
- } else {
- BoostFromScoreKernel_1_BinaryLogloss<false, true><<<num_blocks, CALC_INIT_SCORE_BLOCK_SIZE_BINARY>>>
- (cuda_label_, num_data_, cuda_boost_from_score_, cuda_sum_weights_, cuda_weights_, ova_class_id_);
- }
+ SetCUDAMemory<double>(cuda_boost_from_score_, 0, 1, __FILE__, __LINE__);
+ if (cuda_weights_ == nullptr) {
+ BoostFromScoreKernel_1_BinaryLogloss<false><<<num_blocks, CALC_INIT_SCORE_BLOCK_SIZE_BINARY>>>
+ (cuda_label_, num_data_, cuda_boost_from_score_, cuda_sum_weights_, cuda_weights_);
} else {
- if (cuda_weights_ == nullptr) {
- BoostFromScoreKernel_1_BinaryLogloss<true, false><<<num_blocks, CALC_INIT_SCORE_BLOCK_SIZE_BINARY>>>
- (cuda_label_, num_data_, cuda_boost_from_score_, cuda_sum_weights_, cuda_weights_, ova_class_id_);
- } else {
- BoostFromScoreKernel_1_BinaryLogloss<true, true><<<num_blocks, CALC_INIT_SCORE_BLOCK_SIZE_BINARY>>>
- (cuda_label_, num_data_, cuda_boost_from_score_, cuda_sum_weights_, cuda_weights_, ova_class_id_);
- }
+ BoostFromScoreKernel_1_BinaryLogloss<true><<<num_blocks, CALC_INIT_SCORE_BLOCK_SIZE_BINARY>>>
+ (cuda_label_, num_data_, cuda_boost_from_score_, cuda_sum_weights_, cuda_weights_);
}
SynchronizeCUDADevice(__FILE__, __LINE__);
if (cuda_weights_ == nullptr) {
@@ -114,15 +105,15 @@ void CUDABinaryLogloss::LaunchBoostFromScoreKernel() const {
SynchronizeCUDADevice(__FILE__, __LINE__);
}

- template <bool USE_LABEL_WEIGHT, bool USE_WEIGHT, bool IS_OVA>
+ template <bool USE_LABEL_WEIGHT, bool USE_WEIGHT>
__global__ void GetGradientsKernel_BinaryLogloss(const double* cuda_scores, const label_t* cuda_labels,
- const double* cuda_label_weights, const label_t* cuda_weights, const int ova_class_id,
+ const double* cuda_label_weights, const label_t* cuda_weights,
const double sigmoid, const data_size_t num_data,
score_t* cuda_out_gradients, score_t* cuda_out_hessians) {
const data_size_t data_index = static_cast<data_size_t>(blockDim.x * blockIdx.x + threadIdx.x);
if (data_index < num_data) {
const label_t cuda_label = static_cast<int>(cuda_labels[data_index]);
- const int label = IS_OVA ? (cuda_label == ova_class_id ? 1 : -1) : (cuda_label > 0 ? 1 : -1);
+ const int label = cuda_label > 0 ? 1 : -1;
const double response = -label * sigmoid / (1.0f + exp(label * sigmoid * cuda_scores[data_index]));
const double abs_response = fabs(response);
if (!USE_WEIGHT) {
@@ -153,41 +144,24 @@ __global__ void GetGradientsKernel_BinaryLogloss(const double* cuda_scores, cons
cuda_label_, \
cuda_label_weights_, \
cuda_weights_, \
- ova_class_id_, \
sigmoid_, \
num_data_, \
gradients, \
hessians

void CUDABinaryLogloss::LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const {
const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY;
- if (ova_class_id_ == -1) {
- if (cuda_label_weights_ == nullptr) {
- if (cuda_weights_ == nullptr) {
- GetGradientsKernel_BinaryLogloss<false, false, false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
- } else {
- GetGradientsKernel_BinaryLogloss<false, true, false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
- }
+ if (cuda_label_weights_ == nullptr) {
+ if (cuda_weights_ == nullptr) {
+ GetGradientsKernel_BinaryLogloss<false, false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
} else {
- if (cuda_weights_ == nullptr) {
- GetGradientsKernel_BinaryLogloss<true, false, false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
- } else {
- GetGradientsKernel_BinaryLogloss<true, true, false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
- }
+ GetGradientsKernel_BinaryLogloss<false, true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
}
} else {
- if (cuda_label_weights_ == nullptr) {
- if (cuda_weights_ == nullptr) {
- GetGradientsKernel_BinaryLogloss<false, false, true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
- } else {
- GetGradientsKernel_BinaryLogloss<false, true, true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
- }
+ if (cuda_weights_ == nullptr) {
+ GetGradientsKernel_BinaryLogloss<true, false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
} else {
- if (cuda_weights_ == nullptr) {
- GetGradientsKernel_BinaryLogloss<true, false, true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
- } else {
- GetGradientsKernel_BinaryLogloss<true, true, true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
- }
+ GetGradientsKernel_BinaryLogloss<true, true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
}
}
}
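The deletions above remove the IS_OVA template parameter and the ova_class_id kernel argument from the binary logloss kernels: with this PR, one-vs-all handling is composed at the objective level instead, with CUDAMulticlassOVA (below) owning one CUDABinaryLogloss per class, each constructed with its own class id. The kernel below is only an illustrative sketch of the label convention that the deleted IS_OVA branches evaluated inline; it is not code from the PR, and the kernel and buffer names are hypothetical (labels are plain float here; LightGBM's label_t is typically float).

// Illustrative sketch only, not part of the PR: the one-vs-all label remapping that the
// removed "IS_OVA ? (static_cast<int>(cuda_label) == ova_class_id ? 1 : 0) : ..." expression
// performed inside BoostFromScoreKernel_1_BinaryLogloss for every thread.
__global__ void RemapLabelsForOVAClass(const float* labels, const int num_data,
                                       const int ova_class_id, float* binary_labels) {
  const int i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  if (i < num_data) {
    // 1 for the target class, 0 for every other class, matching the deleted expression.
    binary_labels[i] = (static_cast<int>(labels[i]) == ova_class_id) ? 1.0f : 0.0f;
  }
}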
31 changes: 31 additions & 0 deletions src/objective/cuda/cuda_multiclass_objective.cpp
@@ -36,6 +36,37 @@ void CUDAMulticlassSoftmax::ConvertOutputCUDA(const data_size_t num_data, const
}


CUDAMulticlassOVA::CUDAMulticlassOVA(const Config& config): MulticlassOVA(config) {
for (int i = 0; i < num_class_; ++i) {
cuda_binary_loss_.emplace_back(new CUDABinaryLogloss(config, i));
}
}

CUDAMulticlassOVA::CUDAMulticlassOVA(const std::vector<std::string>& strs): MulticlassOVA(strs) {}

CUDAMulticlassOVA::~CUDAMulticlassOVA() {}

void CUDAMulticlassOVA::Init(const Metadata& metadata, data_size_t num_data) {
MulticlassOVA::Init(metadata, num_data);
for (int i = 0; i < num_class_; ++i) {
cuda_binary_loss_[i]->Init(metadata, num_data);
}
}

void CUDAMulticlassOVA::GetGradients(const double* score, score_t* gradients, score_t* hessians) const {
for (int i = 0; i < num_class_; ++i) {
int64_t offset = static_cast<int64_t>(num_data_) * i;
cuda_binary_loss_[i]->GetGradients(score + offset, gradients + offset, hessians + offset);
}
}

void CUDAMulticlassOVA::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const {
for (int i = 0; i < num_class_; ++i) {
cuda_binary_loss_[i]->ConvertOutputCUDA(num_data, input + i * num_data, output + i * num_data);
}
}


} // namespace LightGBM

#endif // USE_CUDA_EXP
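CUDAMulticlassOVA::GetGradients above strides through class-major buffers: all num_data_ entries for class 0 come first, then class 1, and so on, and each class's slice is handed to its own CUDABinaryLogloss. A minimal host-side sketch of that offset arithmetic (illustrative only; the helper name is hypothetical):

#include <cstdint>

// Offset of class `class_id` inside a class-major buffer of per-data values,
// mirroring "static_cast<int64_t>(num_data_) * i" in GetGradients above.
// Example: with num_data = 1000 and num_class = 3, class 1 owns indices [1000, 2000).
inline std::int64_t ClassOffset(std::int64_t num_data, int class_id) {
  return num_data * static_cast<std::int64_t>(class_id);
}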
32 changes: 32 additions & 0 deletions src/objective/cuda/cuda_multiclass_objective.hpp
@@ -9,9 +9,12 @@

#include <LightGBM/cuda/cuda_objective_function.hpp>

#include <memory>
#include <string>
#include <vector>

#include "cuda_binary_objective.hpp"

#include "../multiclass_objective.hpp"

#define GET_GRADIENTS_BLOCK_SIZE_MULTICLASS (1024)
@@ -54,6 +57,35 @@ class CUDAMulticlassSoftmax: public CUDAObjectiveInterface, public MulticlassSof
};


class CUDAMulticlassOVA: public CUDAObjectiveInterface, public MulticlassOVA {
public:
explicit CUDAMulticlassOVA(const Config& config);

explicit CUDAMulticlassOVA(const std::vector<std::string>& strs);

void Init(const Metadata& metadata, data_size_t num_data) override;

void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override;

void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override;

double BoostFromScore(int class_id) const override {
return cuda_binary_loss_[class_id]->BoostFromScore(0);
}

bool ClassNeedTrain(int class_id) const override {
return cuda_binary_loss_[class_id]->ClassNeedTrain(0);
}

~CUDAMulticlassOVA();

bool IsCUDAObjective() const override { return true; }

private:
std::vector<std::unique_ptr<CUDABinaryLogloss>> cuda_binary_loss_;
};


} // namespace LightGBM

#endif // USE_CUDA_EXP
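BoostFromScore and ClassNeedTrain in the class above forward each class id to the corresponding per-class CUDABinaryLogloss, passing 0 because each binary objective only sees a single binary task. A sketch of how a driver loop might consume that interface (illustrative only; the actual boosting driver lives elsewhere in LightGBM and is not part of this PR):

#include <vector>

#include "cuda_multiclass_objective.hpp"

// Illustrative only: collect the per-class init scores through the OVA objective.
std::vector<double> CollectInitScores(const LightGBM::CUDAMulticlassOVA& objective, int num_class) {
  std::vector<double> init_scores(num_class, 0.0);
  for (int class_id = 0; class_id < num_class; ++class_id) {
    // Internally delegates to cuda_binary_loss_[class_id]->BoostFromScore(0).
    init_scores[class_id] = objective.BoostFromScore(class_id);
  }
  return init_scores;
}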
2 changes: 1 addition & 1 deletion src/objective/multiclass_objective.hpp
@@ -266,7 +266,7 @@ class MulticlassOVA: public ObjectiveFunction {
return binary_loss_[class_id]->ClassNeedTrain(0);
}

- private:
+ protected:
/*! \brief Number of data */
data_size_t num_data_;
/*! \brief Number of classes */
3 changes: 1 addition & 2 deletions src/objective/objective_function.cpp
@@ -43,8 +43,7 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
} else if (type == std::string("multiclass")) {
return new CUDAMulticlassSoftmax(config);
} else if (type == std::string("multiclassova")) {
Log::Warning("Objective multiclassova is not implemented in cuda_exp version. Fall back to boosting on CPU.");
return new MulticlassOVA(config);
return new CUDAMulticlassOVA(config);
} else if (type == std::string("cross_entropy")) {
Log::Warning("Objective cross_entropy is not implemented in cuda_exp version. Fall back to boosting on CPU.");
return new CrossEntropy(config);
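With the fallback warning removed, multiclassova is handled by the CUDA branch of the objective factory like the other cuda_exp objectives. A hedged usage sketch follows; it assumes LightGBM's public Config and ObjectiveFunction headers and that device_type selects the CUDA factory branch, and the helper function itself is hypothetical:

#include <memory>
#include <string>

#include <LightGBM/config.h>
#include <LightGBM/objective_function.h>

// Illustrative only: request the OVA objective through the factory. In a cuda_exp build
// this now returns CUDAMulticlassOVA instead of warning and falling back to the CPU
// MulticlassOVA implementation.
std::unique_ptr<LightGBM::ObjectiveFunction> MakeMulticlassOVA(int num_class) {
  LightGBM::Config config;
  config.num_class = num_class;                   // one CUDABinaryLogloss per class
  config.device_type = std::string("cuda_exp");   // assumed to select the CUDA objective branch
  return std::unique_ptr<LightGBM::ObjectiveFunction>(
      LightGBM::ObjectiveFunction::CreateObjectiveFunction(std::string("multiclassova"), config));
}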