Skip to content

Commit

Permalink
[SYCL] Improve performance of generic shuffles (#3815)
Browse files (browse the repository at this point in the history)
* Fix upper bound in GenericCall

The previous upper bound checked only the starting offset, not the chunk
size, so the memcpy for the final chunk could walk off the end of the byte
array.

* Replace detail::memcpy with std::memcpy

sycl::detail::memcpy is implemented as a loop, so the compiler optimizes it
differently from std::memcpy.

Signed-off-by: John Pennycook <[email protected]>
  • Loading branch information
Pennycook authored Jul 21, 2021
1 parent e9d308e commit fb08adf
Showing 1 changed file with 13 additions and 13 deletions.
26 changes: 13 additions & 13 deletions sycl/include/CL/sycl/detail/spirv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ template <typename T, typename Functor>
void GenericCall(const Functor &ApplyToBytes) {
if (sizeof(T) >= sizeof(ShuffleChunkT)) {
#pragma unroll
for (size_t Offset = 0; Offset < sizeof(T);
for (size_t Offset = 0; Offset + sizeof(ShuffleChunkT) <= sizeof(T);
Offset += sizeof(ShuffleChunkT)) {
ApplyToBytes(Offset, sizeof(ShuffleChunkT));
}
Expand Down Expand Up @@ -160,9 +160,9 @@ EnableIfGenericBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto BroadcastBytes = [=](size_t Offset, size_t Size) {
uint64_t BroadcastX, BroadcastResult;
detail::memcpy(&BroadcastX, XBytes + Offset, Size);
std::memcpy(&BroadcastX, XBytes + Offset, Size);
BroadcastResult = GroupBroadcast<Group>(BroadcastX, local_id);
detail::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
std::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
};
GenericCall<T>(BroadcastBytes);
return Result;
Expand Down Expand Up @@ -213,9 +213,9 @@ EnableIfGenericBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto BroadcastBytes = [=](size_t Offset, size_t Size) {
uint64_t BroadcastX, BroadcastResult;
detail::memcpy(&BroadcastX, XBytes + Offset, Size);
std::memcpy(&BroadcastX, XBytes + Offset, Size);
BroadcastResult = GroupBroadcast<Group>(BroadcastX, local_id);
detail::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
std::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
};
GenericCall<T>(BroadcastBytes);
return Result;
Expand Down Expand Up @@ -697,9 +697,9 @@ EnableIfGenericShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
ShuffleChunkT ShuffleX, ShuffleResult;
detail::memcpy(&ShuffleX, XBytes + Offset, Size);
std::memcpy(&ShuffleX, XBytes + Offset, Size);
ShuffleResult = SubgroupShuffle(ShuffleX, local_id);
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
std::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
};
GenericCall<T>(ShuffleBytes);
return Result;
Expand All @@ -712,9 +712,9 @@ EnableIfGenericShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
ShuffleChunkT ShuffleX, ShuffleResult;
detail::memcpy(&ShuffleX, XBytes + Offset, Size);
std::memcpy(&ShuffleX, XBytes + Offset, Size);
ShuffleResult = SubgroupShuffleXor(ShuffleX, local_id);
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
std::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
};
GenericCall<T>(ShuffleBytes);
return Result;
Expand All @@ -727,9 +727,9 @@ EnableIfGenericShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
ShuffleChunkT ShuffleX, ShuffleResult;
detail::memcpy(&ShuffleX, XBytes + Offset, Size);
std::memcpy(&ShuffleX, XBytes + Offset, Size);
ShuffleResult = SubgroupShuffleDown(ShuffleX, local_id);
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
std::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
};
GenericCall<T>(ShuffleBytes);
return Result;
Expand All @@ -742,9 +742,9 @@ EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
ShuffleChunkT ShuffleX, ShuffleResult;
detail::memcpy(&ShuffleX, XBytes + Offset, Size);
std::memcpy(&ShuffleX, XBytes + Offset, Size);
ShuffleResult = SubgroupShuffleUp(ShuffleX, local_id);
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
std::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
};
GenericCall<T>(ShuffleBytes);
return Result;
Expand Down

0 comments on commit fb08adf

Please sign in to comment.