-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
cuda_row_data.hpp
179 lines (144 loc) · 6.63 KB
/
cuda_row_data.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifdef USE_CUDA
#ifndef LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_
#define LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/config.h>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/dataset.h>
#include <LightGBM/train_share_states.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <vector>
#define COPY_SUBROW_BLOCK_SIZE_ROW_DATA (1024)
#if CUDART_VERSION == 10000
#define DP_SHARED_HIST_SIZE (5560)
#else
#define DP_SHARED_HIST_SIZE (6144)
#endif
#define SP_SHARED_HIST_SIZE (DP_SHARED_HIST_SIZE * 2)
namespace LightGBM {
class CUDARowData {
public:
CUDARowData(const Dataset* train_data,
const TrainingShareStates* train_share_state,
const int gpu_device_id,
const bool gpu_use_dp);
~CUDARowData();
void Init(const Dataset* train_data,
TrainingShareStates* train_share_state);
void CopySubrow(const CUDARowData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices);
void CopySubcol(const CUDARowData* full_set, const std::vector<int8_t>& is_feature_used, const Dataset* train_data);
void CopySubrowAndSubcol(const CUDARowData* full_set, const data_size_t* used_indices,
const data_size_t num_used_indices, const std::vector<bool>& is_feature_used, const Dataset* train_data);
template <typename BIN_TYPE>
const BIN_TYPE* GetBin() const;
template <typename PTR_TYPE>
const PTR_TYPE* GetPartitionPtr() const;
template <typename PTR_TYPE>
const PTR_TYPE* GetRowPtr() const;
int NumLargeBinPartition() const { return static_cast<int>(large_bin_partitions_.size()); }
int num_feature_partitions() const { return num_feature_partitions_; }
int max_num_column_per_partition() const { return max_num_column_per_partition_; }
bool is_sparse() const { return is_sparse_; }
uint8_t bit_type() const { return bit_type_; }
uint8_t row_ptr_bit_type() const { return row_ptr_bit_type_; }
const int* cuda_feature_partition_column_index_offsets() const { return cuda_feature_partition_column_index_offsets_; }
const uint32_t* cuda_column_hist_offsets() const { return cuda_column_hist_offsets_; }
const uint32_t* cuda_partition_hist_offsets() const { return cuda_partition_hist_offsets_; }
int shared_hist_size() const { return shared_hist_size_; }
private:
void DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state);
template <typename BIN_TYPE>
void GetDenseDataPartitioned(const BIN_TYPE* row_wise_data, std::vector<BIN_TYPE>* partitioned_data);
template <typename BIN_TYPE, typename ROW_PTR_TYPE>
void GetSparseDataPartitioned(const BIN_TYPE* row_wise_data,
const ROW_PTR_TYPE* row_ptr,
std::vector<std::vector<BIN_TYPE>>* partitioned_data,
std::vector<std::vector<ROW_PTR_TYPE>>* partitioned_row_ptr,
std::vector<ROW_PTR_TYPE>* partition_ptr);
template <typename BIN_TYPE, typename ROW_PTR_TYPE>
void InitSparseData(const BIN_TYPE* host_data,
const ROW_PTR_TYPE* host_row_ptr,
BIN_TYPE** cuda_data,
ROW_PTR_TYPE** cuda_row_ptr,
ROW_PTR_TYPE** cuda_partition_ptr);
/*! \brief number of threads to use */
int num_threads_;
/*! \brief number of training data */
data_size_t num_data_;
/*! \brief number of bins of all features */
int num_total_bin_;
/*! \brief number of feature groups in dataset */
int num_feature_group_;
/*! \brief number of features in dataset */
int num_feature_;
/*! \brief number of bits used to store each bin value */
uint8_t bit_type_;
/*! \brief number of bits used to store each row pointer value */
uint8_t row_ptr_bit_type_;
/*! \brief is sparse row wise data */
bool is_sparse_;
/*! \brief start column index of each feature partition */
std::vector<int> feature_partition_column_index_offsets_;
/*! \brief histogram offset of each column */
std::vector<uint32_t> column_hist_offsets_;
/*! \brief hisotgram offset of each partition */
std::vector<uint32_t> partition_hist_offsets_;
/*! \brief maximum number of columns among all feature partitions */
int max_num_column_per_partition_;
/*! \brief number of partitions */
int num_feature_partitions_;
/*! \brief used when bagging with subset, number of used indices */
data_size_t num_used_indices_;
/*! \brief used when bagging with subset, number of total elements */
uint64_t num_total_elements_;
/*! \brief used when bagging with column subset, the size of maximum number of feature partitions */
int cur_num_feature_partition_buffer_size_;
/*! \brief CUDA device ID */
int gpu_device_id_;
/*! \brief index of partitions with large bins that its histogram cannot fit into shared memory, each large bin partition contains a single column */
std::vector<int> large_bin_partitions_;
/*! \brief index of partitions with small bins */
std::vector<int> small_bin_partitions_;
/*! \brief shared memory size used by histogram */
int shared_hist_size_;
/*! \brief whether to use double precision in histograms per block */
bool gpu_use_dp_;
// CUDA memory
/*! \brief row-wise data stored in CUDA, 8 bits */
uint8_t* cuda_data_uint8_t_;
/*! \brief row-wise data stored in CUDA, 16 bits */
uint16_t* cuda_data_uint16_t_;
/*! \brief row-wise data stored in CUDA, 32 bits */
uint32_t* cuda_data_uint32_t_;
/*! \brief row pointer stored in CUDA, 16 bits */
uint16_t* cuda_row_ptr_uint16_t_;
/*! \brief row pointer stored in CUDA, 32 bits */
uint32_t* cuda_row_ptr_uint32_t_;
/*! \brief row pointer stored in CUDA, 64 bits */
uint64_t* cuda_row_ptr_uint64_t_;
/*! \brief partition bin offsets, 16 bits */
uint16_t* cuda_partition_ptr_uint16_t_;
/*! \brief partition bin offsets, 32 bits */
uint32_t* cuda_partition_ptr_uint32_t_;
/*! \brief partition bin offsets, 64 bits */
uint64_t* cuda_partition_ptr_uint64_t_;
/*! \brief start column index of each feature partition */
int* cuda_feature_partition_column_index_offsets_;
/*! \brief histogram offset of each column */
uint32_t* cuda_column_hist_offsets_;
/*! \brief hisotgram offset of each partition */
uint32_t* cuda_partition_hist_offsets_;
/*! \brief block buffer when calculating prefix sum */
uint16_t* cuda_block_buffer_uint16_t_;
/*! \brief block buffer when calculating prefix sum */
uint32_t* cuda_block_buffer_uint32_t_;
/*! \brief block buffer when calculating prefix sum */
uint64_t* cuda_block_buffer_uint64_t_;
};
} // namespace LightGBM
#endif // LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_
#endif // USE_CUDA