Skip to content

Commit

Permalink
EC/ROCM: remove unused functions
Browse files Browse the repository at this point in the history
  • Loading branch information
Sergei-Lebedev committed Jan 12, 2023
1 parent cb5fec1 commit 40a1c79
Show file tree
Hide file tree
Showing 4 changed files with 4 additions and 300 deletions.
234 changes: 1 addition & 233 deletions src/components/ec/rocm/ec_rocm.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,43 +13,10 @@
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

/*
 * Textual names accepted for the STREAM_TASK_MODE option, indexed by
 * ucc_ec_rocm_strm_task_mode_t. The slot at UCC_EC_ROCM_TASK_LAST is NULL.
 */
static const char *stream_task_modes[] = {
    [UCC_EC_ROCM_TASK_KERNEL]  = "kernel",
    [UCC_EC_ROCM_TASK_MEM_OPS] = "driver",
    [UCC_EC_ROCM_TASK_AUTO]    = "auto",
    [UCC_EC_ROCM_TASK_LAST]    = NULL
};

/*
 * Textual names accepted for the TASK_STREAM option, indexed by
 * ucc_ec_rocm_task_stream_type_t. The slot at UCC_EC_ROCM_TASK_STREAM_LAST
 * is NULL.
 */
static const char *task_stream_types[] = {
    [UCC_EC_ROCM_USER_STREAM]      = "user",
    [UCC_EC_ROCM_INTERNAL_STREAM]  = "ucc",
    [UCC_EC_ROCM_TASK_STREAM_LAST] = NULL
};

static ucc_config_field_t ucc_ec_rocm_config_table[] = {
{"", "", NULL, ucc_offsetof(ucc_ec_rocm_config_t, super),
UCC_CONFIG_TYPE_TABLE(ucc_ec_config_table)},

{"STREAM_TASK_MODE", "auto",
"Mechanism to create stream dependency\n"
"kernel - use waiting kernel\n"
"driver - use driver MEM_OPS\n"
"auto - runtime automatically chooses best one",
ucc_offsetof(ucc_ec_rocm_config_t, strm_task_mode),
UCC_CONFIG_TYPE_ENUM(stream_task_modes)},

{"TASK_STREAM", "user",
"Stream for rocm task\n"
"user - user stream provided in execution engine context\n"
"ucc - ucc library internal stream",
ucc_offsetof(ucc_ec_rocm_config_t, task_strm_type),
UCC_CONFIG_TYPE_ENUM(task_stream_types)},

{"STREAM_BLOCKING_WAIT", "1",
"Stream is blocked until collective operation is done",
ucc_offsetof(ucc_ec_rocm_config_t, stream_blocking_wait),
UCC_CONFIG_TYPE_UINT},

{"EXEC_NUM_WORKERS", "1",
"Number of thread blocks to use for rocm executor",
ucc_offsetof(ucc_ec_rocm_config_t, exec_num_workers),
Expand Down Expand Up @@ -138,48 +105,14 @@ static void ucc_ec_rocm_executor_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOL
}
}


/*
 * Memory-pool callbacks for the executor pool: chunk allocation/release
 * plus per-object init/cleanup hooks.
 */
static ucc_mpool_ops_t ucc_ec_rocm_ee_executor_mpool_ops = {
    .chunk_alloc   = ucc_ec_rocm_ee_executor_mpool_chunk_malloc,
    .chunk_release = ucc_ec_rocm_ee_executor_mpool_chunk_free,
    .obj_init      = ucc_ec_rocm_executor_chunk_init,
    .obj_cleanup   = ucc_ec_rocm_executor_chunk_cleanup,
};


/*
 * Chunk allocator for the stream-request pool.
 *
 * Allocates *size_p bytes with hipHostMalloc() using the
 * hipHostMallocMapped flag and stores the resulting pointer in *chunk_p.
 *
 * @param mp       Owning memory pool (unused).
 * @param size_p   Requested chunk size in bytes; not modified here.
 * @param chunk_p  Receives the allocated chunk.
 * @return Status produced by ROCM_FUNC() for the hipHostMalloc() call.
 */
static ucc_status_t ucc_ec_rocm_stream_req_mpool_chunk_malloc(ucc_mpool_t *mp, //NOLINT: mp is unused
                                                              size_t *size_p,
                                                              void **chunk_p)
{
    return ROCM_FUNC(hipHostMalloc(chunk_p, *size_p, hipHostMallocMapped));
}

/*
 * Chunk release callback for the stream-request pool.
 *
 * @param mp     Owning memory pool (unused).
 * @param chunk  Chunk previously obtained from the matching chunk allocator;
 *               handed to hipHostFree().
 */
static void ucc_ec_rocm_stream_req_mpool_chunk_free(ucc_mpool_t *mp, //NOLINT: mp is unused
                                                    void *chunk)
{
    /* The HIP return code is discarded: this callback cannot report errors. */
    (void)hipHostFree(chunk);
}

/*
 * Per-object initializer for the stream-request pool.
 *
 * Asks HIP for the device-side address that corresponds to the request's
 * host `status` field and stores it in `dev_status`.
 *
 * @param mp     Owning memory pool (unused).
 * @param obj    Object being initialized; a ucc_ec_rocm_stream_request_t.
 * @param chunk  Chunk the object belongs to (not referenced here).
 */
static void ucc_ec_rocm_stream_req_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused
{
    ucc_ec_rocm_stream_request_t *request  = obj;
    void                         *host_ptr = (void *)&request->status;
    void                        **dev_ptr  = (void **)(&request->dev_status);

    /* The status of the lookup is not propagated: the hook returns void. */
    ROCM_FUNC(hipHostGetDevicePointer(dev_ptr, host_ptr, 0));
}

/*
 * Memory-pool callbacks for the stream-request pool. Objects get their
 * device status pointer resolved on init; no per-object cleanup is needed.
 */
static ucc_mpool_ops_t ucc_ec_rocm_stream_req_mpool_ops = {
    .chunk_alloc   = ucc_ec_rocm_stream_req_mpool_chunk_malloc,
    .chunk_release = ucc_ec_rocm_stream_req_mpool_chunk_free,
    .obj_init      = ucc_ec_rocm_stream_req_init,
    .obj_cleanup   = NULL
};

static void ucc_ec_rocm_event_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused
static void ucc_ec_rocm_event_init(ucc_mpool_t *mp, void *obj, void *chunk)
{
ucc_ec_rocm_event_t *base = (ucc_ec_rocm_event_t *) obj;

Expand All @@ -206,37 +139,12 @@ static ucc_mpool_ops_t ucc_ec_rocm_event_mpool_ops = {
.obj_cleanup = ucc_ec_rocm_event_cleanup,
};

/*
 * Kernel-based variant of the stream task post routine; same signature as
 * ucc_ec_rocm_post_driver_stream_task(). Only declared here -- the
 * definition lives in another translation unit.
 * NOTE(review): presumably implemented in the ROCm wait-kernel source; confirm.
 */
ucc_status_t ucc_ec_rocm_post_kernel_stream_task(uint32_t *status,
int blocking_wait,
hipStream_t stream);

/*
 * Posts a stream task using HIP stream memory operations.
 *
 * When blocking_wait is non-zero the stream first writes
 * UCC_EC_ROCM_TASK_STARTED to *status and then waits until the value equals
 * UCC_EC_ROCM_TASK_COMPLETED. In all cases the stream finally writes
 * UCC_EC_ROCM_TASK_COMPLETED_ACK.
 *
 * @param status         Device-visible address of the task status word.
 * @param blocking_wait  Non-zero to make the stream wait for completion.
 * @param stream         HIP stream on which the operations are enqueued.
 * @return UCC_OK when every operation was enqueued; otherwise the status
 *         produced by ROCM_FUNC() for the first operation that failed.
 */
static ucc_status_t ucc_ec_rocm_post_driver_stream_task(uint32_t *status,
                                                        int blocking_wait,
                                                        hipStream_t stream)
{
    hipDeviceptr_t status_ptr = (hipDeviceptr_t)status;
    ucc_status_t   st;

    if (blocking_wait) {
        st = ROCM_FUNC(hipStreamWriteValue32(stream, status_ptr,
                                             UCC_EC_ROCM_TASK_STARTED, 0));
        if (st != UCC_OK) {
            return st;
        }
        st = ROCM_FUNC(hipStreamWaitValue32(stream, status_ptr,
                                            UCC_EC_ROCM_TASK_COMPLETED,
                                            hipStreamWaitValueEq, 0xFFFFFFFF));
        if (st != UCC_OK) {
            return st;
        }
    }
    /*
     * Report a failed enqueue instead of unconditionally returning UCC_OK:
     * ucc_ec_rocm_task_end() spins until COMPLETED_ACK shows up, so a
     * silently dropped write here would never let it finish.
     */
    return ROCM_FUNC(hipStreamWriteValue32(stream, status_ptr,
                                           UCC_EC_ROCM_TASK_COMPLETED_ACK, 0));
}

static ucc_status_t ucc_ec_rocm_init(const ucc_ec_params_t *ec_params)
{
ucc_ec_rocm_config_t *cfg = EC_ROCM_CONFIG;
ucc_status_t status;
int device, num_devices;
int attr=0;
hipError_t rocm_st;
hipDevice_t hip_dev;
const char *hip_err_st_str;
hipDeviceProp_t prop;

ucc_ec_rocm.stream = NULL;
Expand Down Expand Up @@ -290,16 +198,6 @@ static ucc_status_t ucc_ec_rocm_init(const ucc_ec_params_t *ec_params)
return status;
}

/* create request pool */
status = ucc_mpool_init(
&ucc_ec_rocm.strm_reqs, 0, sizeof(ucc_ec_rocm_stream_request_t), 0,
UCC_CACHE_LINE_SIZE, 16, UINT_MAX, &ucc_ec_rocm_stream_req_mpool_ops,
UCC_THREAD_MULTIPLE, "ROCM Event Objects");
if (status != UCC_OK) {
ec_error(&ucc_ec_rocm.super, "failed to create stream pool");
return status;
}

status = ucc_mpool_init(
&ucc_ec_rocm.executors, 0, sizeof(ucc_ec_rocm_executor_t), 0,
UCC_CACHE_LINE_SIZE, 16, UINT_MAX, &ucc_ec_rocm_ee_executor_mpool_ops,
Expand All @@ -314,44 +212,7 @@ static ucc_status_t ucc_ec_rocm_init(const ucc_ec_params_t *ec_params)
sizeof(ucc_ec_rocm_executor_interruptible_task_t), 0, UCC_CACHE_LINE_SIZE,
16, UINT_MAX, NULL, UCC_THREAD_MULTIPLE,
"interruptible executor tasks");
if (status != UCC_OK) {
ec_error(&ucc_ec_rocm.super, "failed to create interruptible tasks pool");
return status;
}

if (cfg->strm_task_mode == UCC_EC_ROCM_TASK_KERNEL) {
ucc_ec_rocm.strm_task_mode = UCC_EC_ROCM_TASK_KERNEL;
ucc_ec_rocm.post_strm_task = ucc_ec_rocm_post_kernel_stream_task;
} else {
ucc_ec_rocm.strm_task_mode = UCC_EC_ROCM_TASK_MEM_OPS;
ucc_ec_rocm.post_strm_task = ucc_ec_rocm_post_driver_stream_task;

rocm_st = hipCtxGetDevice(&hip_dev);
if (rocm_st != hipSuccess){
hip_err_st_str = hipGetErrorString(rocm_st);
ec_debug(&ucc_ec_rocm.super, "hipCtxGetDevice() failed: %s",
hip_err_st_str);
attr = 0;
} else {
ROCM_FUNC(hipDeviceGetAttribute(&attr,
hipDeviceAttributeCanUseStreamWaitValue,
hip_dev));
}
if (cfg->strm_task_mode == UCC_EC_ROCM_TASK_AUTO) {
if (attr == 0) {
ec_info(&ucc_ec_rocm.super,
"ROCm MEM OPS are not supported or disabled");
ucc_ec_rocm.strm_task_mode = UCC_EC_ROCM_TASK_KERNEL;
ucc_ec_rocm.post_strm_task = ucc_ec_rocm_post_kernel_stream_task;
}
} else if (attr == 0) {
ec_error(&ucc_ec_rocm.super,
"ROCm MEM OPS are not supported or disabled");
return UCC_ERR_NOT_SUPPORTED;
}
}

ucc_ec_rocm.task_strm_type = cfg->task_strm_type;
ucc_spinlock_init(&ucc_ec_rocm.init_spinlock, 0);
return UCC_OK;
}
Expand All @@ -364,95 +225,6 @@ static ucc_status_t ucc_ec_rocm_get_attr(ucc_ec_attr_t *ec_attr)
return UCC_OK;
}

/*
 * Posts a stream task for the given execution-engine stream and hands the
 * resulting request back through *ee_req.
 *
 * @param ee_stream  Caller's stream; stored in the request as a hipStream_t.
 * @param ee_req     Receives the allocated stream request on success.
 * @return UCC_OK on success; UCC_ERR_NO_MEMORY when the request or event
 *         pool yields no object; otherwise the status returned by the
 *         configured post_strm_task callback.
 *
 * NOTE(review): UCC_EC_ROCM_INIT_STREAM() and ROCMCHECK() are macros defined
 * outside this view. If they return from the function on failure, the
 * request (and event) obtained below would not be put back into their
 * pools -- confirm against the macro definitions.
 */
ucc_status_t ucc_ec_rocm_task_post(void *ee_stream, void **ee_req)
{
ucc_ec_rocm_config_t *cfg = EC_ROCM_CONFIG;
ucc_ec_rocm_stream_request_t *req;
ucc_ec_rocm_event_t *rocm_event;
ucc_status_t status = UCC_OK;

UCC_EC_ROCM_INIT_STREAM();
req = ucc_mpool_get(&ucc_ec_rocm.strm_reqs);
if (ucc_unlikely(!req)) {
ec_error(&ucc_ec_rocm.super, "Failed to allocate stream request");
return UCC_ERR_NO_MEMORY;
}
/* A fresh request starts in POSTED state and remembers the caller's stream. */
req->status = UCC_EC_ROCM_TASK_POSTED;
req->stream = (hipStream_t)ee_stream;

if (ucc_ec_rocm.task_strm_type == UCC_EC_ROCM_USER_STREAM) {
/* User-stream mode: enqueue the task directly on the caller's stream. */
status = ucc_ec_rocm.post_strm_task(req->dev_status,
cfg->stream_blocking_wait,
req->stream);
if (status != UCC_OK) {
goto free_req;
}
} else {
/* Internal-stream mode: run the task on ucc_ec_rocm.stream, ordered
   against the caller's stream with an event on both sides. */
rocm_event = ucc_mpool_get(&ucc_ec_rocm.events);
if (ucc_unlikely(!rocm_event)) {
ec_error(&ucc_ec_rocm.super, "Failed to allocate rocm event");
status = UCC_ERR_NO_MEMORY;
goto free_req;
}
/* Internal stream waits for work already queued on the caller's stream. */
ROCMCHECK(hipEventRecord(rocm_event->event, req->stream));
ROCMCHECK(hipStreamWaitEvent(ucc_ec_rocm.stream, rocm_event->event, 0));
status = ucc_ec_rocm.post_strm_task(req->dev_status,
cfg->stream_blocking_wait,
ucc_ec_rocm.stream);
if (ucc_unlikely(status != UCC_OK)) {
goto free_event;
}
/* Caller's stream in turn waits for the task queued on the internal stream. */
ROCMCHECK(hipEventRecord(rocm_event->event, ucc_ec_rocm.stream));
ROCMCHECK(hipStreamWaitEvent(req->stream, rocm_event->event, 0));
/* The event goes back to its pool as soon as the waits are enqueued. */
ucc_mpool_put(rocm_event);
}

*ee_req = (void *) req;

ec_info(&ucc_ec_rocm.super, "ROCM stream task posted on \"%s\" stream. req:%p",
task_stream_types[ucc_ec_rocm.task_strm_type], req);

return UCC_OK;

/* Error unwinding: release in reverse order of acquisition. */
free_event:
ucc_mpool_put(rocm_event);
free_req:
ucc_mpool_put(req);
return status;
}

/*
 * Reports whether a posted stream task has started.
 *
 * @param ee_req  Request returned by ucc_ec_rocm_task_post().
 * @return UCC_INPROGRESS while the request is still in POSTED state,
 *         UCC_OK otherwise.
 */
ucc_status_t ucc_ec_rocm_task_query(void *ee_req)
{
    ucc_ec_rocm_stream_request_t *req = ee_req;

    /* A queried task may only be POSTED, STARTED or COMPLETED_ACK.
       COMPLETED is written by ucc_ec_rocm_task_end() to request that the
       stream be unblocked, so it must not be observed here. */
    ucc_assert(req->status != UCC_EC_ROCM_TASK_COMPLETED);

    if (req->status != UCC_EC_ROCM_TASK_POSTED) {
        ec_info(&ucc_ec_rocm.super, "ROCM stream task started. req:%p", req);
        return UCC_OK;
    }

    return UCC_INPROGRESS;
}

/*
 * Ends a stream task and returns its request to the pool.
 *
 * If the request is in STARTED state, COMPLETED is written to its status and
 * the function busy-waits until the status reads COMPLETED_ACK. A request
 * already in COMPLETED_ACK state is released without waiting.
 *
 * @param ee_req  Request returned by ucc_ec_rocm_task_post().
 * @return Always UCC_OK.
 */
ucc_status_t ucc_ec_rocm_task_end(void *ee_req)
{
ucc_ec_rocm_stream_request_t *req = ee_req;
/* Accessed through a volatile pointer so every check re-reads memory. */
volatile ucc_ec_task_status_t *st = &req->status;

/* can be safely ended only if it's in STARTED or COMPLETED_ACK state */
ucc_assert((*st != UCC_EC_ROCM_TASK_POSTED) &&
(*st != UCC_EC_ROCM_TASK_COMPLETED));
if (*st == UCC_EC_ROCM_TASK_STARTED) {
/* Ask for the stream to be unblocked, then spin until acknowledged.
   NOTE(review): the ACK is presumably written by the stream operation
   enqueued at post time (the driver variant enqueues a COMPLETED_ACK
   write); confirm for the kernel variant. */
*st = UCC_EC_ROCM_TASK_COMPLETED;
while(*st != UCC_EC_ROCM_TASK_COMPLETED_ACK) { }
}
ucc_mpool_put(req);
/* Only the pointer value is logged; the request is not dereferenced after the put. */
ec_info(&ucc_ec_rocm.super, "ROCM stream task done. req:%p", req);
return UCC_OK;
}

ucc_status_t ucc_ec_rocm_event_create(void **event)
{
ucc_ec_rocm_event_t *rocm_event;
Expand Down Expand Up @@ -512,7 +284,6 @@ static ucc_status_t ucc_ec_rocm_finalize()
}

ucc_mpool_cleanup(&ucc_ec_rocm.events, 1);
ucc_mpool_cleanup(&ucc_ec_rocm.strm_reqs, 1);
ucc_mpool_cleanup(&ucc_ec_rocm.executors, 1);
ucc_free(ucc_ec_rocm.exec_streams);
return UCC_OK;
Expand All @@ -532,9 +303,6 @@ ucc_ec_rocm_t ucc_ec_rocm = {
.table = ucc_ec_rocm_config_table,
.size = sizeof(ucc_ec_rocm_config_t),
},
.super.ops.task_post = ucc_ec_rocm_task_post,
.super.ops.task_query = ucc_ec_rocm_task_query,
.super.ops.task_end = ucc_ec_rocm_task_end,
.super.ops.create_event = ucc_ec_rocm_event_create,
.super.ops.destroy_event = ucc_ec_rocm_event_destroy,
.super.ops.event_post = ucc_ec_rocm_event_post,
Expand Down
19 changes: 0 additions & 19 deletions src/components/ec/rocm/ec_rocm.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,6 @@
#include <hip_fp16.h>
#include <hip_bfloat16.h>

/*
 * Mechanism used to create a stream dependency. UCC_EC_ROCM_TASK_LAST is
 * not a mode; it marks the number of real entries.
 */
typedef enum ucc_ec_rocm_strm_task_mode {
    UCC_EC_ROCM_TASK_KERNEL  = 0, /* use a waiting kernel */
    UCC_EC_ROCM_TASK_MEM_OPS = 1, /* use driver MEM_OPS */
    UCC_EC_ROCM_TASK_AUTO    = 2, /* choose at runtime */
    UCC_EC_ROCM_TASK_LAST    = 3,
} ucc_ec_rocm_strm_task_mode_t;

/*
 * Stream on which a ROCm task is posted. UCC_EC_ROCM_TASK_STREAM_LAST is
 * not a stream type; it marks the number of real entries.
 */
typedef enum ucc_ec_rocm_task_stream_type {
    UCC_EC_ROCM_USER_STREAM      = 0, /* stream supplied by the user */
    UCC_EC_ROCM_INTERNAL_STREAM  = 1, /* library-internal stream */
    UCC_EC_ROCM_TASK_STREAM_LAST = 2
} ucc_ec_rocm_task_stream_type_t;

typedef enum ucc_ec_task_status {
UCC_EC_ROCM_TASK_COMPLETED,
UCC_EC_ROCM_TASK_POSTED,
Expand Down Expand Up @@ -72,9 +59,6 @@ typedef ucc_status_t (*ucc_ec_rocm_task_post_fn) (uint32_t *dev_status,

typedef struct ucc_ec_rocm_config {
ucc_ec_config_t super;
ucc_ec_rocm_strm_task_mode_t strm_task_mode;
ucc_ec_rocm_task_stream_type_t task_strm_type;
int stream_blocking_wait;
unsigned long exec_num_workers;
unsigned long exec_num_threads;
unsigned long exec_max_tasks;
Expand All @@ -96,9 +80,6 @@ typedef struct ucc_ec_rocm {
ucc_mpool_t executors;
ucc_mpool_t executor_interruptible_tasks;
ucc_thread_mode_t thread_mode;
ucc_ec_rocm_strm_task_mode_t strm_task_mode;
ucc_ec_rocm_task_stream_type_t task_strm_type;
ucc_ec_rocm_task_post_fn post_strm_task;
ucc_spinlock_t init_spinlock;
ucc_ee_executor_t *cpu_executor;
} ucc_ec_rocm_t;
Expand Down
7 changes: 3 additions & 4 deletions src/components/ec/rocm/kernel/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,14 @@ HIPCCFLAGS = \
LINK = $(LIBTOOL) --mode=link $(CC) -o $@

.cu.o:
$(HIPCC) -c $< -o $@ $(HIPCCFLAGS)
$(HIPCC) -c $< -o $@ $(HIPCCFLAGS)

.cu.lo:
/bin/bash $(top_srcdir)/cuda_lt.sh "$(LIBTOOL)" $@ $(HIPCC) -c $< $(HIPCCFLAGS)
/bin/bash $(top_srcdir)/cuda_lt.sh "$(LIBTOOL)" $@ $(HIPCC) -c $< $(HIPCCFLAGS)

comp_noinst = libucc_ec_rocm_kernels.la

libucc_ec_rocm_kernels_la_SOURCES = ec_rocm_wait_kernel.cu \
ec_rocm_executor_kernel.cu \
libucc_ec_rocm_kernels_la_SOURCES = ec_rocm_executor_kernel.cu \
ec_rocm_reduce.cu
libucc_ec_rocm_kernels_la_CPPFLAGS =

Expand Down
Loading

0 comments on commit 40a1c79

Please sign in to comment.