From a2d48eebc5c521a00fd166a098ff4e8a90803d81 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka
Date: Mon, 16 Mar 2020 23:37:21 +0900
Subject: [PATCH 01/10] [TOPI][OP] Use Thrust sort for argsort and topk

The current GPU sort implementation (odd-even transposition sort) is too
slow when the number of elements is large. This PR introduces a Thrust
implementation of sort, which is much faster.

Note that this change requires CMake 3.8 or later, since we have to use
nvcc to compile the Thrust code.
---
 CMakeLists.txt                       |   8 +-
 cmake/modules/CUDA.cmake             |   6 ++
 python/tvm/relay/op/strategy/cuda.py |  11 +++
 src/runtime/contrib/thrust/thrust.cu | 132 +++++++++++++++++++++++++++
 topi/python/topi/cuda/sort.py        | 117 +++++++++++++++++++++++-
 5 files changed, 271 insertions(+), 3 deletions(-)
 create mode 100644 src/runtime/contrib/thrust/thrust.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index db63903d4395..57f93aa9e877 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
-cmake_minimum_required(VERSION 3.2)
-project(tvm C CXX)
+cmake_minimum_required(VERSION 3.8)
+project(tvm C CXX CUDA)
 
 # Utility functions
 include(cmake/util/Util.cmake)
@@ -55,6 +55,7 @@ tvm_option(USE_MKL_PATH "MKL root path when use MKL blas" none)
 tvm_option(USE_MKLDNN "Build with MKLDNN" OFF)
 tvm_option(USE_CUDNN "Build with cuDNN" OFF)
 tvm_option(USE_CUBLAS "Build with cuBLAS" OFF)
+tvm_option(USE_THRUST "Build with Thrust" OFF)
 tvm_option(USE_MIOPEN "Build with ROCM:MIOpen" OFF)
 tvm_option(USE_ROCBLAS "Build with ROCM:RoCBLAS" OFF)
 tvm_option(USE_SORT "Build with sort support" OFF)
@@ -101,9 +102,11 @@ else(MSVC)
     message("Build in Debug mode")
     set(CMAKE_C_FLAGS "-O0 -g -Wall -fPIC ${CMAKE_C_FLAGS}")
     set(CMAKE_CXX_FLAGS "-O0 -g -Wall -fPIC ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_CUDA_FLAGS "-O0 -g -Xcompiler=-Wall -Xcompiler=-fPIC ${CMAKE_CUDA_FLAGS}")
   else()
     set(CMAKE_C_FLAGS "-O2 -Wall -fPIC ${CMAKE_C_FLAGS}")
     set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_CUDA_FLAGS "-O2 -Xcompiler=-Wall -Xcompiler=-fPIC ${CMAKE_CUDA_FLAGS}")
     if (HIDE_PRIVATE_SYMBOLS)
       message(STATUS "Hide private symbols...")
       set(CMAKE_C_FLAGS "-fvisibility=hidden ${CMAKE_C_FLAGS}")
@@ -262,6 +265,7 @@ if(NOT MSVC)
   check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14)
   message(STATUS "Build with c++14")
   set(CMAKE_CXX_FLAGS "-std=c++14 ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CUDA_STANDARD 14)
 endif()
 
 add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS})
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index 250a65c591e1..d7f02d582c15 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -55,6 +55,12 @@ if(USE_CUDA)
     endif()
   endif(USE_CUBLAS)
 
+  if(USE_THRUST)
+    message(STATUS "Build with Thrust support")
+    file(GLOB CONTRIB_THRUST_SRC src/runtime/contrib/thrust/*.cu)
+    list(APPEND RUNTIME_SRCS ${CONTRIB_THRUST_SRC})
+  endif(USE_THRUST)
+
 else(USE_CUDA)
   list(APPEND COMPILER_SRCS src/target/opt/build_cuda_off.cc)
 endif(USE_CUDA)
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index e5eff1c6b790..8ccd6bf51508 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -20,6 +20,7 @@
 from tvm.te import SpecializedCondition
 from .generic import *
 from .. import op as _op
+from .... import get_global_func
 
 @schedule_injective.register(["cuda", "gpu"])
 def schedule_injective_cuda(attrs, outs, target):
@@ -328,6 +329,11 @@ def argsort_strategy_cuda(attrs, inputs, out_type, target):
                                 wrap_compute_argsort(topi.cuda.argsort),
                                 wrap_topi_schedule(topi.cuda.schedule_argsort),
                                 name="argsort.cuda")
+    if get_global_func("tvm.contrib.thrust.sort", allow_missing=True):
+        strategy.add_implementation(wrap_compute_argsort(topi.cuda.argsort_thrust),
+                                    wrap_topi_schedule(topi.cuda.schedule_argsort),
+                                    name="argsort_thrust.cuda",
+                                    plevel=15)
     return strategy
 
 @topk_strategy.register(["cuda", "gpu"])
@@ -337,6 +343,11 @@ def topk_strategy_cuda(attrs, inputs, out_type, target):
     strategy.add_implementation(wrap_compute_topk(topi.cuda.topk),
                                 wrap_topi_schedule(topi.cuda.schedule_topk),
                                 name="topk.cuda")
+    if get_global_func("tvm.contrib.thrust.sort", allow_missing=True):
+        strategy.add_implementation(wrap_compute_topk(topi.cuda.topk_thrust),
+                                    wrap_topi_schedule(topi.cuda.schedule_topk),
+                                    name="topk_thrust.cuda",
+                                    plevel=15)
     return strategy
 
 @multibox_prior_strategy.register(["cuda", "gpu"])
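
[Editor's note] The two registrations above give the Thrust path a higher priority (plevel=15) than the default GPU sort, but only when the packed function is actually compiled into the runtime. The snippet below is an illustrative sketch, not part of the patch; it assumes the standard tvm.get_global_func Python API.

    # Quick check (illustrative): is the Thrust sort kernel available in this build?
    import tvm

    if tvm.get_global_func("tvm.contrib.thrust.sort", allow_missing=True) is not None:
        print("Thrust build detected: CUDA argsort/topk will prefer the Thrust strategy")
    else:
        print("No Thrust support: the default odd-even transposition sort is used")
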
diff --git a/src/runtime/contrib/thrust/thrust.cu b/src/runtime/contrib/thrust/thrust.cu
new file mode 100644
index 000000000000..80c4fa5297ce
--- /dev/null
+++ b/src/runtime/contrib/thrust/thrust.cu
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file Use external Thrust library call
+ */
+
+#include <tvm/runtime/registry.h>
+
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+
+namespace tvm {
+namespace contrib {
+
+using namespace runtime;
+
+// Performs sorting along axis -1 and returns both sorted values and indices.
+template<typename DataType, typename IndicesType>
+void thrust_sort(DLTensor* input,
+                 DLTensor* out_values,
+                 DLTensor* out_indices,
+                 bool is_ascend) {
+  thrust::device_ptr<DataType> data_ptr(static_cast<DataType*>(input->data));
+  thrust::device_ptr<DataType> values_ptr(static_cast<DataType*>(out_values->data));
+  thrust::device_ptr<IndicesType> indices_ptr(static_cast<IndicesType*>(out_indices->data));
+
+  int n_values = input->shape[input->ndim - 1];
+  int n_iter = 1;
+  for (int i = 0; i < input->ndim - 1; ++i) {
+    n_iter *= input->shape[i];
+  }
+
+  thrust::copy(data_ptr, data_ptr + n_iter * n_values, values_ptr);
+
+  for (int i = 0 ; i < n_iter; ++i) {
+    thrust::sequence(indices_ptr, indices_ptr + n_values);
+    if (is_ascend) {
+      thrust::sort_by_key(values_ptr, values_ptr + n_values, indices_ptr);
+    } else {
+      thrust::sort_by_key(values_ptr, values_ptr + n_values, indices_ptr,
+                          thrust::greater<DataType>());
+    }
+    values_ptr += n_values;
+    indices_ptr += n_values;
+  }
+}
+
+TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sort")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  CHECK_GE(args.num_args, 4);
+  DLTensor* input = args[0];
+  DLTensor* values_out = args[1];
+  DLTensor* indices_out = args[2];
+  bool is_ascend = args[3];
+
+  auto data_dtype = DLDataType2String(input->dtype);
+  auto out_dtype = DLDataType2String(indices_out->dtype);
+
+  if (data_dtype == "float32") {
+    if (out_dtype == "int32") {
+      thrust_sort<float, int32_t>(input, values_out, indices_out, is_ascend);
+    } else if (out_dtype == "int64") {
+      thrust_sort<float, int64_t>(input, values_out, indices_out, is_ascend);
+    } else if (out_dtype == "float32") {
+      thrust_sort<float, float>(input, values_out, indices_out, is_ascend);
+    } else if (out_dtype == "float64") {
+      thrust_sort<float, double>(input, values_out, indices_out, is_ascend);
+    } else {
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
+    }
+  } else if (data_dtype == "float64") {
+    if (out_dtype == "int32") {
+      thrust_sort<double, int32_t>(input, values_out, indices_out, is_ascend);
+    } else if (out_dtype == "int64") {
+      thrust_sort<double, int64_t>(input, values_out, indices_out, is_ascend);
+    } else if (out_dtype == "float32") {
+      thrust_sort<double, float>(input, values_out, indices_out, is_ascend);
+    } else if (out_dtype == "float64") {
+      thrust_sort<double, double>(input, values_out, indices_out, is_ascend);
+    } else {
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
+    }
+  } else if (data_dtype == "int32") {
+    if (out_dtype == "int32") {
+      thrust_sort<int32_t, int32_t>(input, values_out, indices_out, is_ascend);
+    } else if (out_dtype == "int64") {
+      thrust_sort<int32_t, int64_t>(input, values_out, indices_out, is_ascend);
+    } else if (out_dtype == "float32") {
+      thrust_sort<int32_t, float>(input, values_out, indices_out, is_ascend);
+    } else if (out_dtype == "float64") {
+      thrust_sort<int32_t, double>(input, values_out, indices_out, is_ascend);
+    } else {
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
+    }
+  } else if (data_dtype == "int64") {
+    if (out_dtype == "int32") {
+      thrust_sort<int64_t, int32_t>(input, values_out, indices_out, is_ascend);
+    } else if (out_dtype == "int64") {
+      thrust_sort<int64_t, int64_t>(input, values_out, indices_out, is_ascend);
+    } else if (out_dtype == "float32") {
+      thrust_sort<int64_t, float>(input, values_out, indices_out, is_ascend);
+    } else if (out_dtype == "float64") {
+      thrust_sort<int64_t, double>(input, values_out, indices_out, is_ascend);
+    } else {
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
+    }
+  } else {
+    LOG(FATAL) << "Unsupported input dtype: " << data_dtype;
+  }
+});
+
+}  // namespace contrib
+}  // namespace tvm
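
[Editor's note] The kernel above sorts each innermost row independently and returns both the sorted values and the indices of those values in the original row. A NumPy reference of the same semantics, for comparison only (a sketch, not part of the patch; tie ordering may differ from the Thrust result):

    # NumPy reference for the tvm.contrib.thrust.sort semantics (illustrative only).
    import numpy as np

    def reference_sort(data, is_ascend=True):
        # argsort along the last axis, flipped for descending order
        order = np.argsort(data, axis=-1)
        if not is_ascend:
            order = order[..., ::-1]
        values = np.take_along_axis(data, order, axis=-1)
        return values, order
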
diff --git a/topi/python/topi/cuda/sort.py b/topi/python/topi/cuda/sort.py
index f9e535e133fa..4b3a304ca77c 100644
--- a/topi/python/topi/cuda/sort.py
+++ b/topi/python/topi/cuda/sort.py
@@ -21,7 +21,7 @@
 from .injective import schedule_injective_from_existing
 from ..math import identity
-from ..transform import strided_slice
+from ..transform import strided_slice, transpose
 from .. import tag
 
 def _schedule_sort(outs):
@@ -291,6 +291,40 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"):
                    tag="argsort_gpu")[1]
     return out
 
+def argsort_thrust(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"):
+    """Performs sorting along the given axis and returns an array of indices
+    having the same shape as the input array, indexing the data in sorted order.
+
+    Parameters
+    ----------
+    data: tvm.te.Tensor
+        The input array.
+
+    valid_count : tvm.te.Tensor, optional
+        The number of valid elements to be sorted.
+
+    axis : int, optional
+        Axis along which to sort the input tensor.
+
+    is_ascend : boolean, optional
+        Whether to sort in ascending or descending order.
+
+    dtype : string, optional
+        DType of the output indices.
+
+    Returns
+    -------
+    out : tvm.te.Tensor
+        The output of this function.
+    """
+    if valid_count is not None:
+        # TODO: implement argsort_nms with Thrust
+        out = argsort(data, valid_count, axis, is_ascend, dtype)
+    else:
+        out = topk_thrust(data, 0, axis, "indices", is_ascend, dtype)
+    return out
+
+
 def schedule_argsort(outs):
     """Schedule for argsort operator.
 
@@ -384,6 +418,87 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"):
     return output
 
 
+def topk_thrust(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"):
+    """Get the top k elements in an input tensor along the given axis.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        The input tensor.
+
+    k : int, optional
+        Number of top elements to select. Return all elements if k < 1.
+
+    axis : int, optional
+        Axis along which to sort the input tensor.
+
+    ret_type: str, optional
+        The return type [both, values, indices].
+        "both": return both top k data and indices.
+        "values": return top k data only.
+        "indices": return top k indices only.
+
+    is_ascend : boolean, optional
+        Whether to sort in ascending or descending order.
+
+    dtype : string, optional
+        The data type of the indices output.
+
+    Returns
+    -------
+    out : tvm.te.Tensor or List[tvm.te.Tensor]
+        The computed result.
+    """
+    assert ret_type in ["both", "values", "indices"]
+    ndim = len(data.shape)
+    axis = ndim + axis if axis < 0 else axis
+
+    def swap(arr):
+        """ swap arr[axis] and arr[-1] """
+        return arr[:axis] + [arr[-1]] + arr[axis+1:-1] + [arr[axis]]
+
+    if axis != ndim - 1:
+        # Prepare for sorting along axis -1.
+        axes = swap(list(range(ndim)))
+        data = transpose(data, axes)
+
+    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
+    out_bufs = [
+        tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8),
+        tvm.tir.decl_buffer(data.shape, dtype, "indices_buf", data_alignment=8)
+    ]
+
+    output = te.extern([data.shape, data.shape],
+                       [data],
+                       lambda ins, outs: tvm.tir.call_packed(
+                           "tvm.contrib.thrust.sort", ins[0], outs[0], outs[1], is_ascend),
+                       in_buffers=[data_buf],
+                       out_buffers=out_bufs,
+                       name="topk_gpu",
+                       tag="topk_gpu")
+
+    if k > 0:
+        beg = [0] * ndim
+        end = data.shape[:-1] + [k]
+
+        values_out, indices_out = output
+        values_out = strided_slice(values_out, beg, end)
+        indices_out = strided_slice(indices_out, beg, end)
+        output = [values_out, indices_out]
+
+    if axis != ndim - 1:
+        axes = swap(list(range(ndim)))
+        output[0] = transpose(output[0], axes)
+        output[1] = transpose(output[1], axes)
+
+    if ret_type == "values":
+        output = output[0]
+    elif ret_type == "indices":
+        output = output[1]
+
+    return output
+
+
 def schedule_topk(outs):
     """Schedule for argsort operator.
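
[Editor's note] Because the Thrust kernel only sorts along the innermost axis, topk_thrust moves the requested axis to the last position before the te.extern call and moves it back afterwards. The NumPy sketch below (illustrative, not part of the patch) shows why applying the same axis swap twice restores the original layout:

    # Illustration of the swap()/transpose trick used in topk_thrust.
    import numpy as np

    def swap(arr, axis):
        # exchange positions `axis` and -1, as in the patch
        return arr[:axis] + [arr[-1]] + arr[axis + 1:-1] + [arr[axis]]

    x = np.random.rand(3, 5, 4)
    axes = swap(list(range(x.ndim)), axis=1)
    y = np.sort(np.transpose(x, axes), axis=-1)   # sort along the (moved) axis
    restored = np.transpose(y, axes)              # the swap permutation is an involution
    np.testing.assert_allclose(restored, np.sort(x, axis=1))
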
From 4823e93a736c84eb5ca55d93a6694168ca499732 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka
Date: Thu, 19 Mar 2020 19:43:00 +0900
Subject: [PATCH 02/10] cmake: make CUDA optional

---
 CMakeLists.txt           | 2 +-
 cmake/modules/CUDA.cmake | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 57f93aa9e877..4d09bdb65132 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.8)
-project(tvm C CXX CUDA)
+project(tvm C CXX)
 
 # Utility functions
 include(cmake/util/Util.cmake)
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index d7f02d582c15..0c8438293475 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -57,6 +57,7 @@ if(USE_CUDA)
 
   if(USE_THRUST)
     message(STATUS "Build with Thrust support")
+    enable_language(CUDA)
     file(GLOB CONTRIB_THRUST_SRC src/runtime/contrib/thrust/*.cu)
     list(APPEND RUNTIME_SRCS ${CONTRIB_THRUST_SRC})
   endif(USE_THRUST)

From 98b2aa3c022be6a6bb098280f76a581ee0e9785f Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka
Date: Thu, 19 Mar 2020 19:57:40 +0900
Subject: [PATCH 03/10] allow .cu files to be added to the repository

---
 tests/lint/check_file_type.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index 42cdff714534..a2bb260e27ad 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -42,6 +42,7 @@
     "pxi",
     "pyd",
     "pyx",
+    "cu",
     # relay text format
     "rly",
     # configurations
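
[Editor's note] With the build plumbing above in place, the registered kernel can also be exercised directly, independently of the Relay strategy. The snippet below is a hypothetical smoke test (device and NDArray API names as of the TVM version this series targets; not part of the patches):

    # Hypothetical smoke test: call tvm.contrib.thrust.sort directly on GPU buffers.
    import numpy as np
    import tvm

    thrust_sort = tvm.get_global_func("tvm.contrib.thrust.sort", allow_missing=True)
    if thrust_sort is not None:
        ctx = tvm.gpu(0)
        data = tvm.nd.array(np.random.rand(4, 16).astype("float32"), ctx)
        values = tvm.nd.empty((4, 16), "float32", ctx)
        indices = tvm.nd.empty((4, 16), "int32", ctx)
        thrust_sort(data, values, indices, True)  # is_ascend=True
        np.testing.assert_allclose(values.asnumpy(), np.sort(data.asnumpy(), axis=-1))
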
From 13b3edcdeb17b1c420042de94223154a70334ba0 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka
Date: Thu, 19 Mar 2020 20:06:05 +0900
Subject: [PATCH 04/10] pylint fix and cleanup

---
 topi/python/topi/cuda/sort.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/topi/python/topi/cuda/sort.py b/topi/python/topi/cuda/sort.py
index 4b3a304ca77c..5499683f9d23 100644
--- a/topi/python/topi/cuda/sort.py
+++ b/topi/python/topi/cuda/sort.py
@@ -468,7 +468,7 @@ def swap(arr):
         tvm.tir.decl_buffer(data.shape, dtype, "indices_buf", data_alignment=8)
     ]
 
-    output = te.extern([data.shape, data.shape],
+    out = te.extern([data.shape, data.shape],
                        [data],
                        lambda ins, outs: tvm.tir.call_packed(
                            "tvm.contrib.thrust.sort", ins[0], outs[0], outs[1], is_ascend),
                        in_buffers=[data_buf],
                        out_buffers=out_bufs,
                        name="topk_gpu",
                        tag="topk_gpu")
 
     if k > 0:
         beg = [0] * ndim
         end = data.shape[:-1] + [k]
-
-        values_out, indices_out = output
-        values_out = strided_slice(values_out, beg, end)
-        indices_out = strided_slice(indices_out, beg, end)
-        output = [values_out, indices_out]
+        out = [strided_slice(o, beg, end) for o in out]
 
     if axis != ndim - 1:
         axes = swap(list(range(ndim)))
-        output[0] = transpose(output[0], axes)
-        output[1] = transpose(output[1], axes)
+        out = [transpose(o, axes) for o in out]
 
     if ret_type == "values":
-        output = output[0]
+        out = out[0]
     elif ret_type == "indices":
-        output = output[1]
+        out = out[1]
 
-    return output
+    return out
 
 
 def schedule_topk(outs):
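
[Editor's note] The k > 0 branch simplified above implements top-k as "sort everything, then slice the first k entries along the last axis"; k < 1 keeps all elements, matching the TOPI docstring. A NumPy analogue of that post-processing (illustrative only, not part of the patch):

    # NumPy analogue of the top-k slicing done after the full Thrust sort.
    import numpy as np

    def topk_reference(data, k, is_ascend=False):
        order = np.argsort(data, axis=-1)
        if not is_ascend:
            order = order[..., ::-1]
        values = np.take_along_axis(data, order, axis=-1)
        if k > 0:  # k < 1 returns all elements
            values, order = values[..., :k], order[..., :k]
        return values, order
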
From addba76532611239ead06f5c73f76df4240a98d7 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka
Date: Thu, 19 Mar 2020 22:30:20 +0900
Subject: [PATCH 05/10] require cmake 3.8 only when thrust is enabled

---
 CMakeLists.txt           | 2 +-
 cmake/modules/CUDA.cmake | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4d09bdb65132..74288029d020 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.8)
+cmake_minimum_required(VERSION 3.2)
 project(tvm C CXX)
 
 # Utility functions
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index 0c8438293475..27af78d9ebfc 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -57,6 +57,7 @@ if(USE_CUDA)
 
   if(USE_THRUST)
     message(STATUS "Build with Thrust support")
+    cmake_minimum_required(VERSION 3.8) # to compile CUDA code
     enable_language(CUDA)
     file(GLOB CONTRIB_THRUST_SRC src/runtime/contrib/thrust/*.cu)
     list(APPEND RUNTIME_SRCS ${CONTRIB_THRUST_SRC})

From 041860e42f81fe8b5791b073131129241fbb78c6 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka
Date: Thu, 19 Mar 2020 20:19:22 +0000
Subject: [PATCH 06/10] fix nvcc compiler error when passing -pthread

---
 cmake/modules/CUDA.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index 27af78d9ebfc..936bb681b7ff 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -57,7 +57,7 @@ if(USE_CUDA)
 
   if(USE_THRUST)
     message(STATUS "Build with Thrust support")
-    cmake_minimum_required(VERSION 3.8) # to compile CUDA code
+    cmake_minimum_required(VERSION 3.13) # to compile CUDA code
     enable_language(CUDA)
     file(GLOB CONTRIB_THRUST_SRC src/runtime/contrib/thrust/*.cu)
     list(APPEND RUNTIME_SRCS ${CONTRIB_THRUST_SRC})

From 0d88876a2ec25b3039cb1e040b94f76a84aad2ee Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka
Date: Thu, 19 Mar 2020 20:20:34 +0000
Subject: [PATCH 07/10] add missing include

---
 src/runtime/contrib/thrust/thrust.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/runtime/contrib/thrust/thrust.cu b/src/runtime/contrib/thrust/thrust.cu
index 80c4fa5297ce..fc9deacd371b 100644
--- a/src/runtime/contrib/thrust/thrust.cu
+++ b/src/runtime/contrib/thrust/thrust.cu
@@ -21,6 +21,7 @@
  * \file Use external Thrust library call
  */
 
+#include <dlpack/dlpack.h>
 #include <tvm/runtime/registry.h>
 

From b1e388dfe674ace9874ce4202251385ae4e698d1 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka
Date: Fri, 20 Mar 2020 14:58:05 +0900
Subject: [PATCH 08/10] add USE_THRUST option in config.cmake

---
 cmake/config.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cmake/config.cmake b/cmake/config.cmake
index 7e40f0d17869..fd295aa35b34 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -201,3 +201,6 @@ set(USE_VTA_FPGA OFF)
 
 # Whether to build the example external runtime module
 set(USE_EXAMPLE_EXT_RUNTIME OFF)
+
+# Whether to use Thrust
+set(USE_THRUST OFF)
From bc06ce4459e756cb69f5686dce2b346cb0283e0c Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka
Date: Fri, 20 Mar 2020 15:47:47 +0900
Subject: [PATCH 09/10] retrigger CI


From cc59cbfd5e34fc41fe00985899c95550aea04925 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka
Date: Fri, 20 Mar 2020 20:32:02 +0900
Subject: [PATCH 10/10] retrigger CI
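
[Editor's note] Taken together, the series lets relay.argsort and relay.topk on CUDA dispatch to the Thrust kernels. The snippet below is a sketch of an end-to-end check, assuming a CUDA build configured with USE_THRUST=ON and the Relay executor API names of early-2020 TVM; it is not part of the patches.

    # End-to-end sketch: relay.argsort on CUDA should pick the argsort_thrust.cuda strategy.
    import numpy as np
    import tvm
    from tvm import relay

    x = relay.var("x", shape=(1, 8192), dtype="float32")
    func = relay.Function([x], relay.argsort(x, axis=-1, is_ascend=True, dtype="int32"))

    data = np.random.rand(1, 8192).astype("float32")
    executor = relay.create_executor("graph", ctx=tvm.gpu(0), target="cuda")
    result = executor.evaluate(func)(data).asnumpy()
    np.testing.assert_allclose(result, np.argsort(data, axis=-1).astype("int32"))
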