Skip to content

Commit

Permalink
[cherry-pick]XPUPS add support for kunlun2 (#41916)
Browse files Browse the repository at this point in the history
* [XPUPS]add support for kunlun2 (#40985)


[XPUPS]add support for kunlun2

Co-authored-by: WorgenZhang <[email protected]>

* [XPUPS]fix hashtable_kernel.kps (#41790)

* refactor heter comm kernel

* update. test=develop

* update calc_shard_offset. test=develop

* update xpu kernel. test=develop

* update args of calc_shard_offset

* update. test=develop

* remove customGradMerger

* update. test=develop

* update. test=develop

* fix. test=develop

* update. test=develop

* update. test=develop

* update optimizer kernel

* update. test=develop

* update. test=develop

* update. test=develop

* update. test=develop

* update. test=develop

* update. test=develop

* update. test=develop

* update. test=develop

* fix. test=develop

* fix. test=develop

* add optimizer kernel. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix kunlun not support size_t. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* update hashtable. test=develop

* update. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* update. test=develop

* update. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* template init. test=develop

* hashtable template init. test=develop

* fix. test=develop

* fix. test=devlop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix hashtable_kernel. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

Co-authored-by: WorgenZhang <[email protected]>

* [XPUPS]modify xpu_kp.cmake with HETERPS&PSLIB (#41760)

* modify xpu_kp.cmake with HETERPS&PSLIB

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

Co-authored-by: WorgenZhang <[email protected]>
  • Loading branch information
zmxdream and WorgenZhang authored Apr 18, 2022
1 parent 8ccdb91 commit 3a2fb4c
Show file tree
Hide file tree
Showing 21 changed files with 1,951 additions and 393 deletions.
10 changes: 8 additions & 2 deletions cmake/xpu_kp.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,12 @@ macro(compile_kernel COMPILE_ARGS)
string(REPLACE ";" " " XPU_CXX_DEFINES "${XPU_CXX_DEFINES}" )
separate_arguments(XPU_CXX_DEFINES UNIX_COMMAND "${XPU_CXX_DEFINES}")

set(ABI_VERSION "")
if(WITH_HETERPS AND WITH_PSLIB)
set(ABI_VERSION "-D_GLIBCXX_USE_CXX11_ABI=0")
else()
set(ABI_VERSION "-D_GLIBCXX_USE_CXX11_ABI=1")
endif()
add_custom_command(
OUTPUT
kernel_build/${kernel_name}.bin.o
Expand All @@ -130,7 +136,7 @@ macro(compile_kernel COMPILE_ARGS)
COMMAND
${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu
COMMAND
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
-I. -o kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.xpu
--xpu-device-only -c -v
COMMAND
Expand All @@ -153,7 +159,7 @@ macro(compile_kernel COMPILE_ARGS)
COMMAND
${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu
COMMAND
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
-I. -o kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.xpu
--xpu-host-only -c -v
WORKING_DIRECTORY
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/framework/fleet/heter_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ limitations under the License. */
#include <vector>

#ifdef PADDLE_WITH_PSLIB
#include "common_value.h" // NOLINT
#include "common/common_value.h" // NOLINT
#endif

#ifdef PADDLE_WITH_PSCORE
Expand Down
4 changes: 3 additions & 1 deletion paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ IF(WITH_GPU)
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS})
endif()
nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS})
nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h DEPS ${HETERPS_DEPS})
nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h DEPS ${HETERPS_DEPS})
nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
if(WITH_PSCORE)
Expand Down
24 changes: 12 additions & 12 deletions paddle/fluid/framework/fleet/heter_ps/feature_value.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,18 +52,18 @@ struct FeaturePushValue {
float lr_g;
float mf_g[MF_DIM];

__device__ __forceinline__ FeaturePushValue
operator+(const FeaturePushValue& a) const {
FeaturePushValue out;
out.slot = a.slot;
out.show = a.show + show;
out.clk = a.clk + clk;
out.lr_g = a.lr_g + lr_g;
for (int i = 0; i < MF_DIM; ++i) {
out.mf_g[i] = a.mf_g[i] + mf_g[i];
}
return out;
}
// __device__ __forceinline__ FeaturePushValue
// operator+(const FeaturePushValue& a) const {
// FeaturePushValue out;
// out.slot = a.slot;
// out.show = a.show + show;
// out.clk = a.clk + clk;
// out.lr_g = a.lr_g + lr_g;
// for (int i = 0; i < MF_DIM; ++i) {
// out.mf_g[i] = a.mf_g[i] + mf_g[i];
// }
// return out;
// }
};

} // end namespace framework
Expand Down
98 changes: 82 additions & 16 deletions paddle/fluid/framework/fleet/heter_ps/hashtable.h
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,38 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#ifdef PADDLE_WITH_HETERPS
#include <glog/logging.h>
#include <limits>
#include <memory>
#include <vector>

#ifdef PADDLE_WITH_PSLIB
#include "common_value.h" // NOLINT
#endif
#ifdef PADDLE_WITH_PSCORE

#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
#endif
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/phi/core/utils/rw_lock.h"
#include "thrust/pair.h"
// #include "cudf/concurrent_unordered_map.cuh.h"

#if defined(PADDLE_WITH_CUDA)
#include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h"
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h"
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "thrust/pair.h"
#elif defined(__xpu__)
#include <xpu/runtime.h>
#include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/math.h"
#include "xpu/kernel/simd.h"
#endif

namespace paddle {
namespace framework {

#if defined(PADDLE_WITH_CUDA)
template <typename KeyType, typename ValType>
class TableContainer
: public concurrent_unordered_map<KeyType, ValType,
Expand All @@ -45,31 +55,84 @@ class TableContainer
std::numeric_limits<KeyType>::max()>(
capacity, ValType()) {}
};
#elif defined(PADDLE_WITH_XPU_KP)

template <typename KeyType, typename ValType>
class XPUCacheArray {
public:
explicit XPUCacheArray(size_t capacity) : capacity_(capacity), size_(0) {
xpu_malloc(reinterpret_cast<void**>(&keys), capacity_ * sizeof(KeyType));
xpu_malloc(reinterpret_cast<void**>(&vals), capacity_ * sizeof(ValType));
}

virtual ~XPUCacheArray() {
xpu_free(keys);
xpu_free(vals);
}

void print() {}
// ValType* find(const KeyType& key) { return NULL; }
// bool insert(const KeyType& key, const ValType& val) { return true; }

int prefetch(const int dev_id, XPUStream stream = NULL) { return 0; }
size_t size() { return size_; }

private:
long long capacity_;
long long size_;
KeyType* keys;
ValType* vals;
};
#endif

template <typename KeyType, typename ValType>
class HashTable {
public:
HashTable(size_t capacity);
explicit HashTable(size_t capacity);
virtual ~HashTable();
HashTable(const HashTable&) = delete;
HashTable& operator=(const HashTable&) = delete;

template <typename StreamType>
void insert(const KeyType* d_keys, const ValType* d_vals, size_t len,
gpuStream_t stream);
StreamType stream);

template <typename StreamType>
void insert(const KeyType* d_keys, size_t len, char* pool, size_t start_index,
gpuStream_t stream);
StreamType stream);

template <typename StreamType>
void get(const KeyType* d_keys, ValType* d_vals, size_t len,
gpuStream_t stream);
void get(const KeyType* d_keys, char* d_vals, size_t len, gpuStream_t stream);
StreamType stream);

template <typename StreamType>
void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream);

void show();
void dump_to_cpu(int devid, cudaStream_t stream);

template <typename GradType, typename Sgd>
template <typename StreamType>
void dump_to_cpu(int devid, StreamType stream);

#if defined(PADDLE_WITH_CUDA)

template <typename GradType, typename Sgd, typename StreamType>
void update(const KeyType* d_keys, const GradType* d_grads, size_t len,
Sgd sgd, gpuStream_t stream);
Sgd sgd, StreamType stream);

template <typename Sgd>
template <typename Sgd, typename StreamType>
void update(const KeyType* d_keys, const char* d_grads, size_t len, Sgd sgd,
gpuStream_t stream);
StreamType stream);

#elif defined(PADDLE_WITH_XPU_KP)
template <typename GradType, typename StreamType>
void update(const KeyType* d_keys, const GradType* d_grads, size_t len,
StreamType stream);

template <typename StreamType>
void update(const KeyType* d_keys, const char* d_grads, size_t len,
StreamType stream);

#endif

int size() { return container_->size(); }

Expand All @@ -84,7 +147,11 @@ class HashTable {
std::unique_ptr<phi::RWLock> rwlock_{nullptr};

private:
#if defined(PADDLE_WITH_CUDA)
TableContainer<KeyType, ValType>* container_;
#elif defined(PADDLE_WITH_XPU_KP)
XPUCacheArray<KeyType, ValType>* container_;
#endif
int BLOCK_SIZE_{256};
float LOAD_FACTOR{0.75f};
size_t capacity_;
Expand All @@ -94,5 +161,4 @@ class HashTable {
};
} // end namespace framework
} // end namespace paddle
#include "hashtable_inl.h"
#endif
Loading

0 comments on commit 3a2fb4c

Please sign in to comment.