-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Change device handle interfaces & others (#142)
* Changed device handle interfaces * Changed proxy service interfaces * Move device code into separate files * Fixed FIFO polling issues * Add configuration arguments in several interface functions --------- Co-authored-by: Changho Hwang <[email protected]> Co-authored-by: Binyang Li <[email protected]> Co-authored-by: root <root@a100-saemal0.qxveptpukjsuthqvv514inp03c.gx.internal.cloudapp.net>
- Loading branch information
1 parent
4865b20
commit 8d1b984
Showing
59 changed files
with
1,269 additions
and
1,034 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT license. | ||
|
||
# Umbrella targets for source formatting with clang-format and black.
#   check-format -- collects the targets that only report formatting problems
#   format       -- collects the targets that rewrite files in place
# Both start out empty; the tool-specific targets are attached to them with
# add_dependencies() once the corresponding tool has been located.
foreach(FORMAT_UMBRELLA_TARGET check-format format)
    add_custom_target(${FORMAT_UMBRELLA_TARGET})
endforeach()
|
||
# C/C++/CUDA formatting through clang-format.
#
#   check-format-cpp -- runs clang-format with --dry-run, so nothing is edited.
#                       It is declared with ALL and therefore also runs as part
#                       of the default build.
#   format-cpp       -- rewrites the sources in place (-i).
#
# The list of files is produced when the target runs, by a backquoted `find`
# over FIND_DIRS, so the generated command relies on shell command
# substitution.
find_program(CLANG_FORMAT clang-format)
if(CLANG_FORMAT)
    message(STATUS "Found clang-format: ${CLANG_FORMAT}")
    set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test)
    # In find(1) the implicit AND binds tighter than -o, so `-type f` has to be
    # repeated in every alternative. Stated only once, it restricts just the
    # first pattern and lets non-regular files match the remaining ones.
    # NOTE(review): the glob patterns reach the shell unquoted and could be
    # expanded against the working directory before find sees them -- confirm.
    add_custom_target(check-format-cpp ALL
        COMMAND ${CLANG_FORMAT} -style=file --dry-run `find ${FIND_DIRS} -type f -name *.h -o -type f -name *.hpp -o -type f -name *.c -o -type f -name *.cc -o -type f -name *.cpp -o -type f -name *.cu`
    )
    add_dependencies(check-format check-format-cpp)
    add_custom_target(format-cpp
        COMMAND ${CLANG_FORMAT} -style=file -i `find ${FIND_DIRS} -type f -name *.h -o -type f -name *.hpp -o -type f -name *.c -o -type f -name *.cc -o -type f -name *.cpp -o -type f -name *.cu`
    )
    add_dependencies(format format-cpp)
else()
    message(STATUS "clang-format not found.")
endif()
|
||
# Python formatting through black, configured by the project's pyproject.toml.
#
#   check-format-py -- runs black with --check over the python and test
#                      directories, so no file is rewritten.
#   format-py       -- runs black over the same directories and rewrites files.
find_program(BLACK black)
if(BLACK)
    message(STATUS "Found black: ${BLACK}")
    add_custom_target(check-format-py
        COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml --check ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test
    )
    add_dependencies(check-format check-format-py)
    add_custom_target(format-py
        COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test
    )
    add_dependencies(format format-py)
else()
    # STATUS must be a bare keyword: written as `STATUS,` it is not recognized
    # as the message mode and is emitted as part of the message text instead.
    message(STATUS "black not found.")
endif()
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT license. | ||
|
||
#ifndef MSCCLPP_FIFO_DEVICE_HPP_ | ||
#define MSCCLPP_FIFO_DEVICE_HPP_ | ||
|
||
#include "poll.hpp" | ||
|
||
namespace mscclpp { | ||
|
||
/// Work element handed to the proxy: a pair of 64-bit unsigned integers, aligned to 16 bytes.
///
/// Elements of this type are stored in the concurrent FIFO, into which multiple device threads push and from
/// which a single host proxy thread consumes.
///
/// Do not use the most significant bit of @ref snd as it is reserved for memory consistency purposes.
struct alignas(16) ProxyTrigger {
  /// First 64-bit word of the trigger.
  uint64_t fst;
  /// Second 64-bit word of the trigger; its most significant bit is reserved (see above).
  uint64_t snd;
};
|
||
/// A concurrent FIFO where multiple device threads can push work elements and a single host proxy thread consumes them. | ||
/// | ||
/// The FIFO has a head pointer allocated on the device which starts at 0 and goes up to 2^64-1, which is almost | ||
/// infinity. There are two copies of the tail, one on the device, @ref FifoDeviceHandle::tailReplica, and another on | ||
/// the host, namely, hostTail. The host always has the "true" tail and occasionally pushes it to the copy on the | ||
/// device. Therefore, most of the time, the device has a stale version. The invariants are: tailReplica <= hostTail <= | ||
/// head. The @ref push() function increments head, hostTail is updated in @ref Fifo::pop(), and it occasionally flushes | ||
/// it to tailReplica via @ref Fifo::flushTail(). | ||
/// | ||
/// Duplicating the tail is a good idea because the FIFO is large enough, and we do not need frequent updates for the | ||
/// tail as there is usually enough space for device threads to push their work into. | ||
/// | ||
struct FifoDeviceHandle { | ||
#ifdef __CUDACC__ | ||
/// Push a trigger to the FIFO. | ||
/// | ||
/// @param trigger The trigger to push. | ||
/// @return The new head of the FIFO. | ||
__forceinline__ __device__ uint64_t push(ProxyTrigger trigger) { | ||
uint64_t curFifoHead = atomicAdd((unsigned long long int*)this->head, 1); | ||
// make the last bit intentionally non-zero so that we can safely poll. Don't worry, we will change it back in host | ||
// side | ||
trigger.snd ^= ((uint64_t)1 << (uint64_t)63); | ||
|
||
// Only one of two conditions need to be met to proceed. Either the tail has advanced enough or where we need to | ||
// write to is 0. However, the first condition is faster to check since the tail is flushed periodically anyways but | ||
// for the second condition we need to read CPU memory. | ||
// As volatile access is slow, we first check using the bare pointer and then use the volatile pointer if the | ||
// condition is not met. | ||
if (curFifoHead >= size + *(this->tailReplica)) { | ||
OR_POLL_MAYBE_JAILBREAK(curFifoHead >= size + *((volatile uint64_t*)this->tailReplica), | ||
*(volatile uint64_t*)&this->triggers[curFifoHead % size] != 0, 1000000); | ||
} | ||
|
||
ProxyTrigger* triggerPtr = (ProxyTrigger*)&(this->triggers[curFifoHead % size]); | ||
asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd)); | ||
return curFifoHead; | ||
} | ||
|
||
/// Wait until there is a place in the FIFO to push a trigger. | ||
/// | ||
/// @param curFifoHead The current head of the FIFO. | ||
__forceinline__ __device__ void sync(uint64_t curFifoHead) { | ||
// Same as push but in this case checking the fist condition is probably faster since for tail to be pushed we need | ||
// to wait for cudaMemcpy to be done. | ||
OR_POLL_MAYBE_JAILBREAK(*(volatile uint64_t*)&(this->triggers[curFifoHead % size]) != 0, | ||
*(volatile uint64_t*)(this->tailReplica) <= curFifoHead, 1000000); | ||
} | ||
#endif // __CUDACC__ | ||
|
||
/// The FIFO buffer that is allocated on the host via `cudaHostAlloc()`. | ||
ProxyTrigger* triggers; | ||
/// Replica of the FIFO tail that is allocated on device. | ||
uint64_t* tailReplica; | ||
/// The FIFO head. Allocated on the device and only accessed by the device. | ||
uint64_t* head; | ||
/// The FIFO size. | ||
int size; | ||
}; | ||
|
||
} // namespace mscclpp | ||
|
||
#endif // MSCCLPP_FIFO_DEVICE_HPP_ |
File renamed without changes.
Oops, something went wrong.