From 424c75242adb77ebbc84960c0091eac870086f20 Mon Sep 17 00:00:00 2001 From: "Meng, Hengyu" Date: Sun, 24 Mar 2024 12:04:25 +0800 Subject: [PATCH] [SYCL] offload op (#6217) * remove no USM methods * leave the schedule to ggml_backend_sched entirely --- ggml-sycl.cpp | 293 ++++---------------------------------------------- ggml-sycl.h | 16 ++- ggml.c | 10 -- llama.cpp | 36 +++---- 4 files changed, 51 insertions(+), 304 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index cc9ee0762b903e..fc4d2964ccac96 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -740,11 +740,7 @@ namespace dpct sycl::queue &default_queue() { -#ifdef DPCT_USM_LEVEL_NONE - return out_of_order_queue(); -#else return in_order_queue(); -#endif // DPCT_USM_LEVEL_NONE } void queues_wait_and_throw() @@ -763,11 +759,7 @@ namespace dpct sycl::queue *create_queue(bool enable_exception_handler = false) { -#ifdef DPCT_USM_LEVEL_NONE - return create_out_of_order_queue(enable_exception_handler); -#else return create_in_order_queue(enable_exception_handler); -#endif // DPCT_USM_LEVEL_NONE } sycl::queue *create_queue(sycl::context context, sycl::device device, @@ -1075,11 +1067,6 @@ namespace dpct static pointer_access_attribute get_pointer_attribute(sycl::queue &q, const void *ptr) { -#ifdef DPCT_USM_LEVEL_NONE - return mem_mgr::instance().is_device_ptr(ptr) - ? 
pointer_access_attribute::device_only - : pointer_access_attribute::host_only; -#else switch (sycl::get_pointer_type(ptr, q.get_context())) { case sycl::usm::alloc::unknown: @@ -1090,7 +1077,6 @@ namespace dpct case sycl::usm::alloc::host: return pointer_access_attribute::host_device; } -#endif } template @@ -1273,11 +1259,7 @@ namespace dpct static inline void *dpct_malloc(size_t size, sycl::queue &q) { -#ifdef DPCT_USM_LEVEL_NONE - return mem_mgr::instance().mem_alloc(size * sizeof(byte_t)); -#else return sycl::malloc_device(size, q.get_device(), q.get_context()); -#endif // DPCT_USM_LEVEL_NONE } #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F)) @@ -1301,25 +1283,7 @@ namespace dpct static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr, valueT value, size_t size) { -#ifdef DPCT_USM_LEVEL_NONE - auto &mm = mem_mgr::instance(); - assert(mm.is_device_ptr(dev_ptr)); - auto alloc = mm.translate_ptr(dev_ptr); - size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr; - - return q.submit([&](sycl::handler &cgh) - { - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - auto new_buffer = alloc.buffer.reinterpret( - sycl::range<1>(alloc.size / sizeof(valueT))); - sycl::accessor - acc(new_buffer, cgh, r, o); - cgh.fill(acc, value); }); -#else return q.fill(dev_ptr, value, size); -#endif // DPCT_USM_LEVEL_NONE } /** @@ -1413,72 +1377,8 @@ namespace dpct { if (!size) return sycl::event{}; -#ifdef DPCT_USM_LEVEL_NONE - auto &mm = mem_mgr::instance(); - auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction); - - switch (real_direction) - { - case host_to_host: - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); }); - case host_to_device: - { - auto alloc = mm.translate_ptr(to_ptr); - size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = 
sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(from_ptr, acc); }); - } - case device_to_host: - { - auto alloc = mm.translate_ptr(from_ptr); - size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(acc, to_ptr); }); - } - case device_to_device: - { - auto to_alloc = mm.translate_ptr(to_ptr); - auto from_alloc = mm.translate_ptr(from_ptr); - size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, r, to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, r, from_o); - cgh.copy(from_acc, to_acc); }); - } - default: - throw std::runtime_error("dpct_memcpy: invalid direction value"); - } -#else return q.memcpy(to_ptr, from_ptr, size, dep_events); GGML_UNUSED(direction); -#endif // DPCT_USM_LEVEL_NONE } // Get actual copy range and make sure it will not exceed range. 
@@ -1618,45 +1518,15 @@ namespace dpct break; } case device_to_device: -#ifdef DPCT_USM_LEVEL_NONE - { - auto &mm = mem_mgr::instance(); - auto to_alloc = mm.translate_ptr(to_surface); - auto from_alloc = mm.translate_ptr(from_surface); - size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr; - event_list.push_back(q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, - get_copy_range(size, to_slice, to_range.get(0)), to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, - get_copy_range(size, from_slice, from_range.get(0)), from_o); - cgh.parallel_for( - size, - [=](sycl::id<3> id) { - to_acc[get_offset(id, to_slice, to_range.get(0))] = - from_acc[get_offset(id, from_slice, from_range.get(0))]; - }); })); - } -#else - event_list.push_back(q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - cgh.parallel_for( - size, - [=](sycl::id<3> id) { - to_surface[get_offset(id, to_slice, to_range.get(0))] = - from_surface[get_offset(id, from_slice, from_range.get(0))]; - }); })); -#endif - break; + event_list.push_back(q.submit([&](sycl::handler &cgh){ + cgh.depends_on(dep_events); + cgh.parallel_for( + size, + [=](sycl::id<3> id) { + to_surface[get_offset(id, to_slice, to_range.get(0))] = + from_surface[get_offset(id, from_slice, from_range.get(0))]; + }); })); + break; default: throw std::runtime_error("dpct_memcpy: invalid direction value"); } @@ -1754,11 +1624,7 @@ namespace dpct { if (ptr) { -#ifdef DPCT_USM_LEVEL_NONE - detail::mem_mgr::instance().mem_free(ptr); -#else sycl::free(ptr, q.get_context()); -#endif // DPCT_USM_LEVEL_NONE } } @@ -1766,11 +1632,7 @@ namespace dpct inline auto get_memory(const void *x) { T *new_x = reinterpret_cast(const_cast(x)); -#ifdef DPCT_USM_LEVEL_NONE - return dpct::get_buffer>(new_x); -#else return 
new_x; -#endif } template @@ -2222,72 +2084,8 @@ namespace dpct { if (!size) return sycl::event{}; -#ifdef DPCT_USM_LEVEL_NONE - auto &mm = mem_mgr::instance(); - auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction); - - switch (real_direction) - { - case host_to_host: - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); }); - case host_to_device: - { - auto alloc = mm.translate_ptr(to_ptr); - size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(from_ptr, acc); }); - } - case device_to_host: - { - auto alloc = mm.translate_ptr(from_ptr); - size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(acc, to_ptr); }); - } - case device_to_device: - { - auto to_alloc = mm.translate_ptr(to_ptr); - auto from_alloc = mm.translate_ptr(from_ptr); - size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, r, to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, r, from_o); - cgh.copy(from_acc, to_acc); }); - } - default: - throw std::runtime_error("dpct_memcpy: invalid direction value"); - } -#else return q.memcpy(to_ptr, from_ptr, size, dep_events); GGML_UNUSED(direction); -#endif // DPCT_USM_LEVEL_NONE } // Get actual copy range and make sure it will not exceed range. 
@@ -2427,34 +2225,6 @@ namespace dpct break; } case device_to_device: -#ifdef DPCT_USM_LEVEL_NONE - { - auto &mm = mem_mgr::instance(); - auto to_alloc = mm.translate_ptr(to_surface); - auto from_alloc = mm.translate_ptr(from_surface); - size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr; - event_list.push_back(q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, - get_copy_range(size, to_slice, to_range.get(0)), to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, - get_copy_range(size, from_slice, from_range.get(0)), from_o); - cgh.parallel_for( - size, - [=](sycl::id<3> id) { - to_acc[get_offset(id, to_slice, to_range.get(0))] = - from_acc[get_offset(id, from_slice, from_range.get(0))]; - }); })); - } -#else event_list.push_back(q.submit([&](sycl::handler &cgh) { cgh.depends_on(dep_events); @@ -2464,7 +2234,6 @@ namespace dpct to_surface[get_offset(id, to_slice, to_range.get(0))] = from_surface[get_offset(id, from_slice, from_range.get(0))]; }); })); -#endif break; default: throw std::runtime_error("dpct_memcpy: invalid direction value"); @@ -2655,9 +2424,6 @@ namespace dpct void *c[], library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type) { -#ifdef DPCT_USM_LEVEL_NONE - throw std::runtime_error("this API is unsupported when USM level is none"); -#else if (scaling_type == library_data_t::real_float && c_type == library_data_t::complex_float) { @@ -2792,7 +2558,6 @@ namespace dpct default: throw std::runtime_error("the combination of data type is unsupported"); } -#endif } /// Computes a batch of matrix-matrix product with general matrices. 
@@ -3131,24 +2896,9 @@ namespace dpct template typename std::enable_if::type &operator[](size_t index) { init(); - #ifdef DPCT_USM_LEVEL_NONE - return dpct::get_buffer::type>( - _device_ptr) - .template get_access()[index]; - #else return _device_ptr[index]; - #endif // DPCT_USM_LEVEL_NONE } - #ifdef DPCT_USM_LEVEL_NONE - /// Get sycl::accessor for the device memory object when usm is not used. - accessor_t get_access(sycl::handler &cgh) { - return get_buffer(_device_ptr) - .template reinterpret(_range) - .template get_access::mode, - detail::memory_traits::target>(cgh); - } - #else /// Get dpct::accessor with dimension info for the device memory object /// when usm is used and dimension is greater than 1. template @@ -3156,7 +2906,6 @@ namespace dpct get_access(sycl::handler &cgh) { return dpct_accessor_t((T *)_device_ptr, _range); } - #endif // DPCT_USM_LEVEL_NONE private: device_memory(value_t *memory_ptr, size_t size) @@ -3201,15 +2950,6 @@ namespace dpct /// Default constructor device_memory() : base(1) {} - - #ifdef DPCT_USM_LEVEL_NONE - /// Get sycl::accessor for the device memory object when usm is not used. 
- accessor_t get_access(sycl::handler &cgh) { - auto buf = get_buffer(base::get_ptr()) - .template reinterpret(sycl::range<1>(1)); - return accessor_t(buf, cgh); - } - #endif // DPCT_USM_LEVEL_NONE }; } // namespace detail @@ -13181,7 +12921,7 @@ int get_work_group_size(int user_device_id) { return prop.get_max_work_group_size(); } -void ggml_init_sycl() try { +static void ggml_init_sycl() try { static bool initialized = false; if (!initialized) { @@ -16677,6 +16417,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { }; ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) { + ggml_init_sycl(); if (device_index>=g_device_count or device_index<0) { printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", device_index, g_device_count-1); @@ -17046,6 +16787,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface }; GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) { + ggml_init_sycl(); // FIXME: this is not thread safe static std::map, struct ggml_backend_buffer_type> buft_map; @@ -17379,6 +17121,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons UNUSED(backend); } +GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) { + const int min_batch_size = 32; + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; + GGML_UNUSED(backend); +} + + static ggml_backend_i ggml_backend_sycl_interface = { /* .get_name = */ ggml_backend_sycl_name, /* .free = */ ggml_backend_sycl_free, @@ -17392,7 +17141,7 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .supports_op = */ ggml_backend_sycl_supports_op, - /* .offload_op = */ NULL, + /* .offload_op = */ ggml_backend_sycl_offload_op, 
/* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, @@ -17406,7 +17155,7 @@ static ggml_guid_t ggml_backend_sycl_guid() { } GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) { - ggml_init_sycl(); // TODO: remove from ggml.c + ggml_init_sycl(); check_allow_gpu_index(device); diff --git a/ggml-sycl.h b/ggml-sycl.h index 1c9d52115222b7..a9f776fc1dd597 100644 --- a/ggml-sycl.h +++ b/ggml-sycl.h @@ -16,16 +16,22 @@ extern "C" { #define GGML_SYCL_MAX_DEVICES 48 #define GGML_SYCL_NAME "SYCL" -GGML_API void ggml_init_sycl(void); -GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); +// backend API GGML_API ggml_backend_t ggml_backend_sycl_init(int device); + +// device buffer GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); + +// split tensor buffer that splits matrices by rows across multiple devices +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); + +// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); + GGML_API void ggml_backend_sycl_print_sycl_devices(void); GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len); GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size); GGML_API GGML_CALL int ggml_backend_sycl_get_device_count(); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); @@ -34,6 +40,10 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index); GGML_API GGML_CALL void 
ggml_backend_sycl_set_single_device_mode(int main_gpu_id); GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode(); + +// SYCL doesn't support registering host memory, keep here for reference +// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); +// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer); #ifdef __cplusplus } #endif diff --git a/ggml.c b/ggml.c index 54365b7ae26c61..18f10a3dc2f75e 100644 --- a/ggml.c +++ b/ggml.c @@ -291,8 +291,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { #include "ggml-opencl.h" #elif defined(GGML_USE_VULKAN) #include "ggml-vulkan.h" -#elif defined(GGML_USE_SYCL) -#include "ggml-sycl.h" #endif // floating point type used to accumulate sums @@ -2698,8 +2696,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_cl_init(); #elif defined(GGML_USE_VULKAN) ggml_vk_init_cpu_assist(); -#elif defined(GGML_USE_SYCL) - ggml_init_sycl(); #endif ggml_setup_op_has_task_pass(); @@ -16115,12 +16111,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); #endif // GGML_USE_VULKAN -#ifdef GGML_USE_SYCL - bool skip_cpu = ggml_sycl_compute_forward(params, tensor); - if (skip_cpu) { - return; - } -#endif // GGML_USE_SYCL switch (tensor->op) { case GGML_OP_DUP: { diff --git a/llama.cpp b/llama.cpp index 9614cdb17400fd..61587cb7abf5a3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13632,30 +13632,28 @@ struct llama_context * llama_new_context_with_model( } } #elif defined(GGML_USE_SYCL) - if (model->n_gpu_layers > 0) { - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); + // with split_mode LLAMA_SPLIT_MODE_NONE or 
LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); + if (backend == nullptr) { + int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_LAYER requires a backend for each GPU + for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { + ggml_backend_t backend = ggml_backend_sycl_init(i); if (backend == nullptr) { - int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu); - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu); + int id_list[GGML_SYCL_MAX_DEVICES]; + ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i); llama_free(ctx); return nullptr; } ctx->backends.push_back(backend); - } else { - // LLAMA_SPLIT_LAYER requires a backend for each GPU - for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { - ggml_backend_t backend = ggml_backend_sycl_init(i); - if (backend == nullptr) { - int id_list[GGML_SYCL_MAX_DEVICES]; - ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - } } } #elif defined(GGML_USE_KOMPUTE)