From 9594f1903de158fc7422ce907836535da1dc58ef Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 13:49:42 +0100 Subject: [PATCH 01/49] update OpenBLAS version to support new architectures --- install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.py b/install.py index 49b1ecce8..ae624ad58 100755 --- a/install.py +++ b/install.py @@ -160,7 +160,7 @@ def install_openblas(openblas_dir, thread_count, verbose): git_clone( temp_dir, url="https://github.com/xianyi/OpenBLAS.git", - tag="v0.3.15", + tag="v0.3.19", verbose=verbose, ) # We can just build this directly From 69fbf7dc3115622357ae545b4de90bd78b3de053 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 14:10:38 +0100 Subject: [PATCH 02/49] initial draft for sort, 1D, key sort --- cunumeric/array.py | 36 ++++++- cunumeric/config.py | 1 + cunumeric/deferred.py | 8 ++ cunumeric/eager.py | 7 ++ cunumeric/lazy.py | 2 +- cunumeric/module.py | 31 +++++++ src/cunumeric.mk | 3 + src/cunumeric/cunumeric_c.h | 1 + src/cunumeric/sort/sort.cc | 134 +++++++++++++++++++++++++++ src/cunumeric/sort/sort.cu | 57 ++++++++++++ src/cunumeric/sort/sort.h | 68 ++++++++++++++ src/cunumeric/sort/sort_omp.cc | 49 ++++++++++ src/cunumeric/sort/sort_template.inl | 59 ++++++++++++ tests/sort.py | 44 +++++++++ 14 files changed, 498 insertions(+), 2 deletions(-) create mode 100644 src/cunumeric/sort/sort.cc create mode 100644 src/cunumeric/sort/sort.cu create mode 100644 src/cunumeric/sort/sort.h create mode 100644 src/cunumeric/sort/sort_omp.cc create mode 100644 src/cunumeric/sort/sort_template.inl create mode 100644 tests/sort.py diff --git a/cunumeric/array.py b/cunumeric/array.py index d7cbe985b..ead21a975 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1474,7 +1474,41 @@ def setfield(self, val, dtype, offset=0): ) def setflags(self, write=None, align=None, uic=None): - self.__array__().setflags(write=write, align=align, uic=uic) + self.__array__(stacklevel=2).setflags( + write=write, align=align, uic=uic + ) + + def sort(self, axis=-1, kind="stable", order=None): + if kind != "stable": + runtime.warn( + "cuNumeric uses a different (stable) algorithm than " + + str(kind) + + " for sorting", + category=RuntimeWarning, + stacklevel=2, + ) + if order is not None: + raise NotImplementedError( + "cuNumeric does not support sorting with 'order' as " + "ndarray only supports numeric values" + ) + if axis >= self.ndim or axis < -self.ndim: + raise ValueError("invalid axis") + + if self._thunk.scalar: + # nothing to do + return + elif self.ndim == 1: + # this is the default -- sorting of 1D array + self._thunk.sort(axis=axis) + return + else: + raise NotImplementedError( + "cuNumeric only supports sorting 1D arrays at the moment" + ) + + # no return value + return def squeeze(self, axis=None): if axis is not None: diff --git a/cunumeric/config.py b/cunumeric/config.py index 1bd6fd198..76e1b97a7 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -100,6 +100,7 @@ class CuNumericOpCode(IntEnum): RAND = _cunumeric.CUNUMERIC_RAND READ = _cunumeric.CUNUMERIC_READ SCALAR_UNARY_RED = _cunumeric.CUNUMERIC_SCALAR_UNARY_RED + SORT = _cunumeric.CUNUMERIC_SORT SYRK = _cunumeric.CUNUMERIC_SYRK TILE = _cunumeric.CUNUMERIC_TILE TRANSPOSE_COPY_2D = _cunumeric.CUNUMERIC_TRANSPOSE_COPY_2D diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index efb451175..7625eea6f 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1517,3 +1517,11 @@ def cholesky(self, src, no_tril=False): cholesky(self, src) 
if not no_tril: self.trilu(self, 0, True) + + def sort(self, axis=-1, kind="stable", order=None): + # TODO support axis parameter + self.runtime.legate_runtime.issue_execution_fence(block=True) + task = self.context.create_task(CuNumericOpCode.SORT) + task.add_output(self.base) + task.execute() + self.runtime.legate_runtime.issue_execution_fence(block=True) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index fc297b085..e629c5d93 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -502,6 +502,13 @@ def nonzero(self): result += (EagerArray(self.runtime, array),) return result + def sort(self, axis=-1, kind="stable", order=None): + self.check_eager_args(axis, kind, order) + if self.deferred is not None: + self.deferred.sort(axis, kind, order) + else: + self.array.sort(axis, kind, order) + def random_uniform(self): if self.deferred is not None: self.deferred.random_uniform() diff --git a/cunumeric/lazy.py b/cunumeric/lazy.py index 44eb1b0ea..90b63a842 100644 --- a/cunumeric/lazy.py +++ b/cunumeric/lazy.py @@ -128,7 +128,7 @@ def bincount(self, rhs, stacklevel, weights=None): def nonzero(self, stacklevel): raise NotImplementedError("Implement in derived classes") - def sort(self, rhs, stacklevel): + def sort(self, axis, kind, order): raise NotImplementedError("Implement in derived classes") def random_uniform(self, stacklevel): diff --git a/cunumeric/module.py b/cunumeric/module.py index db229f9bd..87455716f 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -1802,6 +1802,37 @@ def where(a, x=None, y=None): return ndarray.perform_where(a, x, y) +# Sorting + + +def argsort(a, axis=-1, kind="stable", order=None): + array = ndarray.convert_to_cunumeric_ndarray(a) + return array.argsort(axis=axis, kind=kind, order=order) + + +def lexsort(a, axis=-1): + raise NotImplementedError("Not yet implemented") + + +def msort(a): + return sort(a) + + +def sort(a, axis=-1, kind="stable", order=None): + array = ndarray.convert_to_cunumeric_ndarray(a) + out = array.copy() + out_array = ndarray.convert_to_cunumeric_ndarray(out) + out_array._thunk.sort(axis=axis, kind=kind, order=order) + return out_array + + +def sort_complex(a): + return sort(a) + + +# Counting + + @add_boilerplate("a") def count_nonzero(a, axis=None): if a.size == 0: diff --git a/src/cunumeric.mk b/src/cunumeric.mk index 2c7bb80ca..08662c5ee 100644 --- a/src/cunumeric.mk +++ b/src/cunumeric.mk @@ -43,6 +43,7 @@ GEN_CPU_SRC += cunumeric/ternary/where.cc \ cunumeric/matrix/util.cc \ cunumeric/random/rand.cc \ cunumeric/search/nonzero.cc \ + cunumeric/sort/sort.cc \ cunumeric/stat/bincount.cc \ cunumeric/convolution/convolve.cc \ cunumeric/transform/flip.cc \ @@ -76,6 +77,7 @@ GEN_CPU_SRC += cunumeric/ternary/where_omp.cc \ cunumeric/matrix/util_omp.cc \ cunumeric/random/rand_omp.cc \ cunumeric/search/nonzero_omp.cc \ + cunumeric/sort/sort_omp.cc \ cunumeric/stat/bincount_omp.cc \ cunumeric/convolution/convolve_omp.cc \ cunumeric/transform/flip_omp.cc @@ -112,6 +114,7 @@ GEN_GPU_SRC += cunumeric/ternary/where.cu \ cunumeric/matrix/trsm.cu \ cunumeric/random/rand.cu \ cunumeric/search/nonzero.cu \ + cunumeric/sort/sort.cu \ cunumeric/stat/bincount.cu \ cunumeric/convolution/convolve.cu \ cunumeric/transform/flip.cu \ diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index e6cd513be..574587731 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -45,6 +45,7 @@ enum CuNumericOpCode { CUNUMERIC_RAND, CUNUMERIC_READ, CUNUMERIC_SCALAR_UNARY_RED, + CUNUMERIC_SORT, CUNUMERIC_SYRK, 
CUNUMERIC_TILE, CUNUMERIC_TRANSPOSE_COPY_2D, diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc new file mode 100644 index 000000000..e42535cfc --- /dev/null +++ b/src/cunumeric/sort/sort.cc @@ -0,0 +1,134 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/sort/sort.h" +#include "cunumeric/sort/sort_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +// general routine +template +struct SortImplBody { + using VAL = legate_type_of; + + void operator()(VAL* inptr, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + bool is_index_space, + Legion::DomainPoint index_point, + Legion::Domain domain) + { + // std::cout << "local size = " << volume << ", dist. = " << is_index_space << ", index_point = + // " + // << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << + // std::endl; + + std::sort(inptr, inptr + volume); + + // in case of distributed data we need to switch to sample sort + if (is_index_space) { + // create (starting) sample of (at most) domain.get_volume() equidistant values + // also enrich values with additional indexes rank & local position in order to handle + // duplicate values + size_t num_local_samples = std::min(domain.get_volume(), volume); + size_t local_rank = index_point[0]; + auto local_samples = std::make_unique[]>(num_local_samples); + for (int i = 0; i < num_local_samples; ++i) { + const size_t index = (i + 1) * volume / num_local_samples - 1; + local_samples[i].value = inptr[index]; + local_samples[i].rank = local_rank; + local_samples[i].local_id = index; + } + + // std::cout << "local samples: size = " << num_local_samples << std::endl; + // std::cout << "first = (" << local_samples[0].value << "," << local_samples[0].rank << ","<< + // local_samples[0].local_id << ")" << std::endl; std::cout << "last = (" << + // local_samples[num_local_samples-1].value << "," << local_samples[num_local_samples-1].rank + // << ","<< local_samples[num_local_samples-1].local_id << ")" << std::endl; + + // all2all those samples + // TODO broadcast package size + // TODO allocate targets + // TODO broadcast samples + size_t num_global_samples = 15; + std::unique_ptr[]> global_samples(new SampleEntry[num_global_samples]); + + // sort all samples (utilize 2nd and 3rd sort criteria as well) + std::sort(&(global_samples[0]), + &(global_samples[0]) + num_global_samples, + SampleEntryComparator()); + + // define splitters + auto splitters = std::make_unique[]>(domain.get_volume() - 1); + for (int i = 0; i < domain.get_volume() - 1; ++i) { + const size_t index = (i + 1) * num_global_samples / domain.get_volume() - 1; + splitters[i] = global_samples[index]; + } + + do { + // compute local package sizes for every process based on splitters + std::unique_ptr local_partition_size(new size_t[domain.get_volume()]); + { + size_t range_start = 0; + size_t local_position = 0; + for (int p_index = 0; p_index < 
domain.get_volume(); ++p) { + while (local_position < volume && still smaller or equal) { local_position++; } + + local_partition_size[partition_index++] = local_position - range_start; + range_start = local_position; + } + } + + // communicate local package-sizes all2all + // TODO + + // evaluate distribution result?? + // TODO + + // if (good enough) break; + // TODO + break; + // else iterate/improve splitters + // TODO + + } while (true); + + // all2all accepted distribution + // package sizes should already be known + // all2all communication + // TODO + + // final merge sort of received packages + // TODO + } + } +}; + +/*static*/ void SortTask::cpu_variant(TaskContext& context) +{ + sort_template(context); +} + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) { SortTask::register_variants(); } +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu new file mode 100644 index 000000000..f76b2871c --- /dev/null +++ b/src/cunumeric/sort/sort.cu @@ -0,0 +1,57 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/sort/sort.h" +#include "cunumeric/sort/sort_template.inl" + +#include +#include +#include + +#include "cunumeric/cuda_help.h" + +namespace cunumeric { + +using namespace Legion; + +template +struct SortImplBody { + using VAL = legate_type_of; + + void operator()(VAL* inptr, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + bool is_index_space, + Legion::DomainPoint index_point, + Legion::Domain domain) + { + thrust::sort(inptr, inptr + volume); + + // in case of distributed data we need to switch to sample sort + if (is_index_space) { + // not implemented yet + assert(false); + } + } +}; + +/*static*/ void SortTask::gpu_variant(TaskContext& context) +{ + sort_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h new file mode 100644 index 000000000..8c3f5a0df --- /dev/null +++ b/src/cunumeric/sort/sort.h @@ -0,0 +1,68 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" + +namespace cunumeric { + +struct SortArgs { + Array& output; + bool is_index_space; + Legion::DomainPoint index_point; + Legion::Domain domain; +}; + +template +struct SampleEntry { + VAL value; + size_t rank; + size_t local_id; +}; + +template +struct SampleEntryComparator { + bool operator()(const SampleEntry& a, const SampleEntry& b) const + { + if (a.value < b.value) { + return true; + } else if (a.value == b.value) { + if (a.rank < b.rank) { + return true; + } else if (a.rank == b.rank) { + return a.local_id < b.local_id; + } + } + return false; + } +}; + +class SortTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_SORT; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc new file mode 100644 index 000000000..d8ffadbd0 --- /dev/null +++ b/src/cunumeric/sort/sort_omp.cc @@ -0,0 +1,49 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/sort/sort.h" +#include "cunumeric/sort/sort_template.inl" + +#include + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct SortImplBody { + using VAL = legate_type_of; + + void operator()(VAL* inptr, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + bool is_index_space, + Legion::DomainPoint index_point, + Legion::Domain domain) + { + // not implemented yet + assert(false); + } +}; + +/*static*/ void SortTask::omp_variant(TaskContext& context) +{ + sort_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl new file mode 100644 index 000000000..5355bdbbe --- /dev/null +++ b/src/cunumeric/sort/sort_template.inl @@ -0,0 +1,59 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+
+#include "cunumeric/pitches.h"
+
+namespace cunumeric {
+
+using namespace Legion;
+using namespace legate;
+
+template <VariantKind KIND, LegateTypeCode CODE, int32_t DIM>
+struct SortImplBody;
+
+template <VariantKind KIND>
+struct SortImpl {
+  template <LegateTypeCode CODE, int32_t DIM>
+  void operator()(SortArgs& args) const
+  {
+    using VAL = legate_type_of<CODE>;
+
+    auto rect = args.output.shape<DIM>();
+
+    Pitches<DIM - 1> pitches;
+    size_t volume = pitches.flatten(rect);
+
+    // TODO -- we cannot stop! need to proceed as partition might be filled later
+    if (volume == 0) { return; }
+
+    auto inout = args.output.read_write_accessor<VAL, DIM>(rect);
+
+    SortImplBody<KIND, CODE, DIM>()(
+      inout.ptr(rect), pitches, rect, volume, args.is_index_space, args.index_point, args.domain);
+  }
+};
+
+template <VariantKind KIND>
+static void sort_template(TaskContext& context)
+{
+  SortArgs args{context.outputs()[0],
+                context.task_->is_index_space,
+                context.task_->index_point,
+                context.task_->index_domain};
+  double_dispatch(args.output.dim(), args.output.code(), SortImpl<KIND>{}, args);
+}
+
+}  // namespace cunumeric
diff --git a/tests/sort.py b/tests/sort.py
new file mode 100644
index 000000000..ab5c91193
--- /dev/null
+++ b/tests/sort.py
@@ -0,0 +1,44 @@
+# Copyright 2021 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+
+import cunumeric as num
+
+
+def test():
+    np.random.seed(42)
+    A_np = np.array(np.random.randint(10, size=30), dtype=np.int32)
+
+    A_num = num.array(A_np)
+    print("Sorting array   : " + str(A_np))
+
+    sortA_np = np.sort(A_np)
+    print("Result numpy    : " + str(sortA_np))
+
+    # pdb.set_trace()
+    sortA_num = num.sort(A_num)
+    print("Result cunumeric: " + str(sortA_num))
+
+    A_num.sort()
+    print("Result (inplace): " + str(A_num))
+
+    assert num.allclose(sortA_np, sortA_num)
+
+    return
+
+
+if __name__ == "__main__":
+    test()

From dfa7adbcca414828084a686288fa7f195aeff6b5 Mon Sep 17 00:00:00 2001
From: mfoerste4
Date: Tue, 8 Feb 2022 15:17:28 +0100
Subject: [PATCH 03/49] fixed compile error

---
 src/cunumeric/sort/sort.cc | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc
index e42535cfc..28cf83c23 100644
--- a/src/cunumeric/sort/sort.cc
+++ b/src/cunumeric/sort/sort.cc
@@ -84,15 +84,23 @@ struct SortImplBody {

       do {
         // compute local package sizes for every process based on splitters
-        std::unique_ptr local_partition_size(new size_t[domain.get_volume()]);
+        std::unique_ptr local_partition_size(new size_t[domain.get_volume()]);
         {
           size_t range_start = 0;
           size_t local_position = 0;
-          for (int p_index = 0; p_index < domain.get_volume(); ++p) {
-            while (local_position < volume && still smaller or equal) { local_position++; }
-
-            local_partition_size[partition_index++] = local_position - range_start;
-            range_start = local_position;
+          for (int p_index = 0; p_index < domain.get_volume(); ++p_index) {
+            // move as long as the current value is less than or equal to the current splitter
+            while (local_position < volume &&
+                   (inptr[local_position] < splitters[p_index].value ||
+                    (inptr[local_position] == splitters[p_index].value &&
+
(local_rank < splitters[p_index].rank || + (local_rank == splitters[p_index].rank && + local_position <= splitters[p_index].local_id))))) { + local_position++; + } + + local_partition_size[p_index++] = local_position - range_start; + range_start = local_position; } } From 4c7c3a23a13d5e06297cb82bb70430f718ee6910 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 21:03:50 +0100 Subject: [PATCH 04/49] OpenMP non-distributed implementation, some small fixes, benchmark tool --- examples/sort.py | 102 +++++++++++++++++++++++++++++++++ src/cunumeric/sort/sort.cc | 17 +++--- src/cunumeric/sort/sort.cu | 9 ++- src/cunumeric/sort/sort_omp.cc | 72 ++++++++++++++++++++++- tests/sort.py | 4 +- 5 files changed, 191 insertions(+), 13 deletions(-) create mode 100644 examples/sort.py diff --git a/examples/sort.py b/examples/sort.py new file mode 100644 index 000000000..9142c8a12 --- /dev/null +++ b/examples/sort.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +# Copyright 2021 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import datetime + +import numpy +from benchmark import run_benchmark + +import cunumeric + + +def check_sorted(a, a_numpy): + a_sorted = numpy.sort(a_numpy) + print("Checking result...") + if cunumeric.allclose(a_sorted, a): + print("PASS!") + else: + print("FAIL!") + print("NUMPY : " + str(a_sorted)) + print("CUNUMERIC: " + str(a)) + + +def run_sort(N, perform_check, timing): + + numpy.random.seed(42) + a_numpy = numpy.array( + numpy.random.randint(1000, size=N), dtype=numpy.int32 + ) + a = cunumeric.array(a_numpy) + + start = datetime.datetime.now() + a_sorted = cunumeric.sort(a) + stop = datetime.datetime.now() + + if perform_check: + check_sorted(a_sorted, a_numpy) + else: + # do we need to synchronize? + assert True + delta = stop - start + total = delta.total_seconds() * 1000.0 + if timing: + print("Elapsed Time: " + str(total) + " ms") + return total + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--check", + dest="check", + action="store_true", + help="check the result of the solve", + ) + parser.add_argument( + "-n", + "--num", + type=int, + default=1000000, + dest="N", + help="number of elements in one dimension", + ) + parser.add_argument( + "-t", + "--time", + dest="timing", + action="store_true", + help="perform timing", + ) + parser.add_argument( + "-b", + "--benchmark", + type=int, + default=1, + dest="benchmark", + help="number of times to benchmark this application (default 1 - " + "normal execution)", + ) + + args = parser.parse_args() + run_benchmark( + run_sort, + args.benchmark, + "Sort", + (args.N, args.check, args.timing), + ) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 28cf83c23..af96acdba 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -35,12 +35,13 @@ struct SortImplBody { Legion::DomainPoint index_point, Legion::Domain domain) { - // std::cout << "local size = " << volume << ", dist. 
= " << is_index_space << ", index_point = - // " - // << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << - // std::endl; +#ifdef DEBUG_CUNUMERIC + std::cout << "CPU(" << index_point[0] << "): local size = " << volume + << ", dist. = " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; +#endif - std::sort(inptr, inptr + volume); + std::stable_sort(inptr, inptr + volume); // in case of distributed data we need to switch to sample sort if (is_index_space) { @@ -71,9 +72,9 @@ struct SortImplBody { std::unique_ptr[]> global_samples(new SampleEntry[num_global_samples]); // sort all samples (utilize 2nd and 3rd sort criteria as well) - std::sort(&(global_samples[0]), - &(global_samples[0]) + num_global_samples, - SampleEntryComparator()); + std::stable_sort(&(global_samples[0]), + &(global_samples[0]) + num_global_samples, + SampleEntryComparator()); // define splitters auto splitters = std::make_unique[]>(domain.get_volume() - 1); diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index f76b2871c..2ce1987ad 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -39,7 +39,14 @@ struct SortImplBody { Legion::DomainPoint index_point, Legion::Domain domain) { - thrust::sort(inptr, inptr + volume); +#ifdef DEBUG_CUNUMERIC + std::cout << "GPU(" << index_point[0] << "): local size = " << volume + << ", dist. = " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; +#endif + + thrust::device_ptr dev_ptr(inptr); + thrust::stable_sort(dev_ptr, dev_ptr + volume); // in case of distributed data we need to switch to sample sort if (is_index_space) { diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index d8ffadbd0..1f2d00262 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -24,6 +24,51 @@ namespace cunumeric { using namespace Legion; using namespace legate; +template +void merge(VAL* inptr, size_t start_idx, size_t end_idx, VAL* tmp) +{ + const size_t mid = (end_idx + start_idx) / 2; + size_t left_idx = start_idx; + size_t right_idx = mid; + size_t target_idx = start_idx; + + while (left_idx < mid && right_idx < end_idx) { + if (inptr[left_idx] <= inptr[right_idx]) { + tmp[target_idx++] = inptr[left_idx++]; + } else { + tmp[target_idx++] = inptr[right_idx++]; + } + } + + while (left_idx < mid) { tmp[target_idx++] = inptr[left_idx++]; } + while (right_idx < end_idx) { tmp[target_idx++] = inptr[right_idx++]; } + + std::copy(tmp + start_idx, tmp + end_idx, inptr + start_idx); +} + +// TODO tune +#define SEQUENTIAL_THRESHOLD 1024 +#define TASK_THRESHOLD 2048 + +template +void merge_sort(VAL* inptr, const size_t start_idx, const size_t end_idx, VAL* tmp) +{ + const size_t size = end_idx - start_idx + 1; + if (size > SEQUENTIAL_THRESHOLD) { + const size_t mid = (end_idx + start_idx) / 2; + +#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) + merge_sort(inptr, start_idx, mid, tmp); +#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) + merge_sort(inptr, mid, end_idx, tmp); + +#pragma omp taskwait + merge(inptr, start_idx, end_idx, tmp); + } else if (size > 1) { + std::stable_sort(inptr + start_idx, inptr + end_idx); + } +} + template struct SortImplBody { using VAL = legate_type_of; @@ -36,8 +81,31 @@ struct SortImplBody { Legion::DomainPoint index_point, Legion::Domain domain) { - // not 
implemented yet - assert(false); +#ifdef DEBUG_CUNUMERIC + std::cout << "OMP(" << index_point[0] << ":" << omp_get_max_threads() << ":" << omp_get_nested() + << "): local size = " << volume << ", dist. = " << is_index_space + << ", index_point = " << index_point << ", domain/volume = " << domain << "/" + << domain.get_volume() << std::endl; +#endif + + bool nested = omp_get_nested(); + if (!nested) omp_set_nested(1); + + // merge sort + auto tmp = std::make_unique(volume); + +#pragma omp parallel shared(inptr, tmp) + { +#pragma omp single + merge_sort(inptr, 0, volume, &(tmp[0])); + } + + if (is_index_space) { + // not implemented yet + assert(false); + } + + if (!nested) omp_set_nested(0); } }; diff --git a/tests/sort.py b/tests/sort.py index ab5c91193..b8945d19d 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -31,11 +31,11 @@ def test(): # pdb.set_trace() sortA_num = num.sort(A_num) print("Result cunumeric: " + str(sortA_num)) + assert num.allclose(sortA_np, sortA_num) A_num.sort() print("Result (inplace): " + str(A_num)) - - assert num.allclose(sortA_np, sortA_num) + assert num.allclose(sortA_np, A_num) return From 710c084590f204f47038054f34c7d9abdf1c92f4 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 21:10:51 +0100 Subject: [PATCH 05/49] added missing include --- src/cunumeric/sort/sort.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 2ce1987ad..5530de63d 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include "cunumeric/cuda_help.h" From 07bdb1664257eb976396acd1c93026af1db3cb6f Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 21:48:03 +0100 Subject: [PATCH 06/49] switch to parallel gcc sort --- src/cunumeric/sort/sort_omp.cc | 60 ++-------------------------------- 1 file changed, 2 insertions(+), 58 deletions(-) diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 1f2d00262..030d26cf0 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -17,6 +17,7 @@ #include "cunumeric/sort/sort.h" #include "cunumeric/sort/sort_template.inl" +#include #include namespace cunumeric { @@ -24,51 +25,6 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template -void merge(VAL* inptr, size_t start_idx, size_t end_idx, VAL* tmp) -{ - const size_t mid = (end_idx + start_idx) / 2; - size_t left_idx = start_idx; - size_t right_idx = mid; - size_t target_idx = start_idx; - - while (left_idx < mid && right_idx < end_idx) { - if (inptr[left_idx] <= inptr[right_idx]) { - tmp[target_idx++] = inptr[left_idx++]; - } else { - tmp[target_idx++] = inptr[right_idx++]; - } - } - - while (left_idx < mid) { tmp[target_idx++] = inptr[left_idx++]; } - while (right_idx < end_idx) { tmp[target_idx++] = inptr[right_idx++]; } - - std::copy(tmp + start_idx, tmp + end_idx, inptr + start_idx); -} - -// TODO tune -#define SEQUENTIAL_THRESHOLD 1024 -#define TASK_THRESHOLD 2048 - -template -void merge_sort(VAL* inptr, const size_t start_idx, const size_t end_idx, VAL* tmp) -{ - const size_t size = end_idx - start_idx + 1; - if (size > SEQUENTIAL_THRESHOLD) { - const size_t mid = (end_idx + start_idx) / 2; - -#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) - merge_sort(inptr, start_idx, mid, tmp); -#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) - merge_sort(inptr, mid, end_idx, tmp); - -#pragma omp taskwait - merge(inptr, start_idx, end_idx, 
tmp); - } else if (size > 1) { - std::stable_sort(inptr + start_idx, inptr + end_idx); - } -} - template struct SortImplBody { using VAL = legate_type_of; @@ -88,24 +44,12 @@ struct SortImplBody { << domain.get_volume() << std::endl; #endif - bool nested = omp_get_nested(); - if (!nested) omp_set_nested(1); - - // merge sort - auto tmp = std::make_unique(volume); - -#pragma omp parallel shared(inptr, tmp) - { -#pragma omp single - merge_sort(inptr, 0, volume, &(tmp[0])); - } + __gnu_parallel::stable_sort(inptr, inptr + volume); if (is_index_space) { // not implemented yet assert(false); } - - if (!nested) omp_set_nested(0); } }; From 58b2bf44315dca6d526cd274de38d0254f44001a Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 10 Feb 2022 09:16:31 -0800 Subject: [PATCH 07/49] Enable N-D non-distributed sort --- cunumeric/array.py | 11 +--- cunumeric/deferred.py | 37 ++++++++++--- cunumeric/module.py | 8 +-- src/cunumeric/sort/sort.cc | 15 ++++-- src/cunumeric/sort/sort.cu | 7 ++- src/cunumeric/sort/sort.h | 1 + src/cunumeric/sort/sort_omp.cc | 15 +++++- src/cunumeric/sort/sort_template.inl | 31 +++++++++-- tests/sort.py | 77 +++++++++++++++++++++++++++- 9 files changed, 170 insertions(+), 32 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index ead21a975..e801010d3 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1498,16 +1498,9 @@ def sort(self, axis=-1, kind="stable", order=None): if self._thunk.scalar: # nothing to do return - elif self.ndim == 1: - # this is the default -- sorting of 1D array - self._thunk.sort(axis=axis) - return else: - raise NotImplementedError( - "cuNumeric only supports sorting 1D arrays at the moment" - ) - - # no return value + # this is the default -- sorting of N-D array + self._thunk.sort(axis=axis) return def squeeze(self, axis=None): diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 7625eea6f..7cc5499ba 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1519,9 +1519,34 @@ def cholesky(self, src, no_tril=False): self.trilu(self, 0, True) def sort(self, axis=-1, kind="stable", order=None): - # TODO support axis parameter - self.runtime.legate_runtime.issue_execution_fence(block=True) - task = self.context.create_task(CuNumericOpCode.SORT) - task.add_output(self.base) - task.execute() - self.runtime.legate_runtime.issue_execution_fence(block=True) + axis_normalized = axis + if axis_normalized < 0: + axis_normalized = self.ndim + axis + + if axis_normalized is not self.ndim - 1: + assert axis_normalized < self.ndim - 1 and axis_normalized >= 0 + + # swap axes + swapped = self.swapaxes(axis_normalized, self.ndim - 1) + + # FIXME: ensure *new* distribution does not split last axis (!) 
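+            # The block below mirrors this NumPy-level sketch (assumption:
+            # swapaxes returns a view, so a deep copy is needed to obtain a
+            # contiguous buffer whose last axis is the sort axis):
+            #   tmp = np.swapaxes(a, axis, a.ndim - 1).copy()
+            #   tmp.sort(axis=-1)
+            #   a[...] = np.swapaxes(tmp, axis, a.ndim - 1)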
+ swapped_copy = self.runtime.create_empty_thunk( + swapped.shape, dtype=self.dtype, inputs=[self, swapped] + ) + swapped_copy.copy(swapped, deep=True) + + # run sort on last axis + swapped_copy.sort(self.ndim - 1) + + self.base = swapped_copy.swapaxes( + axis_normalized, self.ndim - 1 + ).base + self.numpy_array = None + else: + # run actual sort task + self.runtime.legate_runtime.issue_execution_fence(block=True) + task = self.context.create_task(CuNumericOpCode.SORT) + task.add_output(self.base) + task.add_scalar_arg(self.base.shape[self.ndim - 1], ty.uint64) + task.execute() + self.runtime.legate_runtime.issue_execution_fence(block=True) diff --git a/cunumeric/module.py b/cunumeric/module.py index 87455716f..99db26846 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -1805,9 +1805,9 @@ def where(a, x=None, y=None): # Sorting +@add_boilerplate("a") def argsort(a, axis=-1, kind="stable", order=None): - array = ndarray.convert_to_cunumeric_ndarray(a) - return array.argsort(axis=axis, kind=kind, order=order) + return a.argsort(axis=axis, kind=kind, order=order) def lexsort(a, axis=-1): @@ -1818,9 +1818,9 @@ def msort(a): return sort(a) +@add_boilerplate("a") def sort(a, axis=-1, kind="stable", order=None): - array = ndarray.convert_to_cunumeric_ndarray(a) - out = array.copy() + out = a.copy() out_array = ndarray.convert_to_cunumeric_ndarray(out) out_array._thunk.sort(axis=axis, kind=kind, order=order) return out_array diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index af96acdba..4d26ce1ad 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -31,6 +31,7 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -39,12 +40,20 @@ struct SortImplBody { std::cout << "CPU(" << index_point[0] << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; + + if (volume <= 30) { + std::cout << "inptr = [ "; + for (size_t i = 0; i < volume; ++i) { std::cout << (i > 0 ? 
", " : " ") << inptr[i]; } + std::cout << "]" << std::endl; + } #endif - std::stable_sort(inptr, inptr + volume); + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + } - // in case of distributed data we need to switch to sample sort - if (is_index_space) { + // in case of distributed data (1D) we need to switch to sample sort + if (is_index_space && DIM == 1) { // create (starting) sample of (at most) domain.get_volume() equidistant values // also enrich values with additional indexes rank & local position in order to handle // duplicate values diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 5530de63d..f43445a8b 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -36,6 +36,7 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -47,10 +48,12 @@ struct SortImplBody { #endif thrust::device_ptr dev_ptr(inptr); - thrust::stable_sort(dev_ptr, dev_ptr + volume); + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + thrust::stable_sort(dev_ptr + start_idx, dev_ptr + start_idx + sort_dim_size); + } // in case of distributed data we need to switch to sample sort - if (is_index_space) { + if (is_index_space && DIM == 1) { // not implemented yet assert(false); } diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index 8c3f5a0df..febc1f57c 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -22,6 +22,7 @@ namespace cunumeric { struct SortArgs { Array& output; + size_t sort_dim_size; bool is_index_space; Legion::DomainPoint index_point; Legion::Domain domain; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 030d26cf0..6c26f07ae 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -33,6 +33,7 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -44,9 +45,19 @@ struct SortImplBody { << domain.get_volume() << std::endl; #endif - __gnu_parallel::stable_sort(inptr, inptr + volume); + if (volume / sort_dim_size > omp_get_max_threads() / 2) // TODO fine tune + { +#pragma omp do schedule(dynamic) + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + } + } else { + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + __gnu_parallel::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + } + } - if (is_index_space) { + if (is_index_space && DIM == 1) { // not implemented yet assert(false); } diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 5355bdbbe..b1922f014 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -36,13 +36,33 @@ struct SortImpl { Pitches pitches; size_t volume = pitches.flatten(rect); - // TODO -- we cannot stop! need to proceed as partition might be filled later - if (volume == 0) { return; } - auto inout = args.output.read_write_accessor(rect); - SortImplBody()( - inout.ptr(rect), pitches, rect, volume, args.is_index_space, args.index_point, args.domain); + /* + * Assumptions: + * 1. 
Sort is always requested for the 'last' dimension within rect
+ * 2. We have product_of_all_other_dimensions independent sort ranges
+ * 3. if we have more than one participant:
+ *    a) 1D-case: we need to perform parallel sort (e.g. via sampling)
+ *    b) ND-case: rect needs to be the full domain in that last dimension
+ */
+
+#ifdef DEBUG_CUNUMERIC
+    std::cout << "DIM=" << DIM << ", rect=" << rect << ", sort_dim_size=" << args.sort_dim_size
+              << std::endl;
+
+    assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.sort_dim_size)) &&
+           "multi-dimensional array should not be distributed in last (sort) dimension");
+#endif
+
+    SortImplBody<KIND, CODE, DIM>()(inout.ptr(rect),
+                                    pitches,
+                                    rect,
+                                    volume,
+                                    args.sort_dim_size,
+                                    args.is_index_space,
+                                    args.index_point,
+                                    args.domain);
   }
 };

@@ -50,6 +70,7 @@ template <VariantKind KIND>
 static void sort_template(TaskContext& context)
 {
   SortArgs args{context.outputs()[0],
+                context.scalars()[0].value<size_t>(),
                 context.task_->is_index_space,
                 context.task_->index_point,
                 context.task_->index_domain};
   double_dispatch(args.output.dim(), args.output.code(), SortImpl<KIND>{}, args);
 }
diff --git a/tests/sort.py b/tests/sort.py
index b8945d19d..bdc4c4b93 100644
--- a/tests/sort.py
+++ b/tests/sort.py
@@ -18,7 +18,17 @@ import cunumeric as num


-def test():
+def test_sort_axis(a_np, a_num, axis):
+    assert num.allclose(a_np, a_num)
+    print("Sorting axis " + str(axis) + ":")
+    sort_np = np.sort(a_np, axis)
+    sort_num = num.sort(a_num, axis, kind="merge")
+    # print(sort_np)
+    # print(sort_num)
+    assert num.allclose(sort_np, sort_num)
+
+
+def test_1D():
     np.random.seed(42)
     A_np = np.array(np.random.randint(10, size=30), dtype=np.int32)

@@ -40,5 +50,70 @@ def test():
     return


+def test_2D():
+    np.random.seed(42)
+    x_dim = 5
+    y_dim = 3
+    A_np = np.array(
+        np.random.randint(10, size=x_dim * y_dim), dtype=np.int32
+    ).reshape(x_dim, y_dim)
+
+    A_num = num.array(A_np)
+    print("Sorting matrix:\n")
+    print(A_num)
+
+    test_sort_axis(A_np, A_num, 1)
+    test_sort_axis(A_np, A_num, 0)
+
+    return
+
+
+def test_3D():
+    np.random.seed(42)
+    x_dim = 5
+    y_dim = 3
+    z_dim = 7
+    A_np = np.array(
+        np.random.randint(10, size=x_dim * y_dim * z_dim), dtype=np.int32
+    ).reshape(x_dim, y_dim, z_dim)
+
+    A_num = num.array(A_np)
+    print("Sorting 3d tensor:\n")
+    print(A_np)
+
+    test_sort_axis(A_np, A_num, 2)
+    test_sort_axis(A_np, A_num, 1)
+    test_sort_axis(A_np, A_num, 0)
+
+    return
+
+
+def test_custom():
+    a = np.arange(2 * 4).reshape(2, 4)
+    a_transpose = np.transpose(a)
+
+    a_transposed_num = num.array([[0, 4], [1, 5], [2, 6], [3, 7]])
+    a_num = num.array(a)
+    a_num_transposed = a_num.swapaxes(0, 1)
+
+    test_sort_axis(a, a_num, 1)
+    test_sort_axis(a_transpose, a_transposed_num, 1)
+    test_sort_axis(a_transpose, a_num_transposed, 1)
+    test_sort_axis(a_transpose, a_num_transposed, 0)
+
+    return
+
+
+def test():
+    print("\n\n ----------- Custom test ---------------\n")
+    test_custom()
+    print("\n\n ----------- 2D test ---------------\n")
+    test_2D()
+    print("\n\n ----------- 3D test ---------------\n")
+    test_3D()
+    print("\n\n ----------- 1D test ---------------\n")
+    test_1D()
+
+
 if __name__ == "__main__":
     test()
url="https://github.com/xianyi/OpenBLAS.git", - tag="v0.3.15", + tag="v0.3.19", verbose=verbose, ) # We can just build this directly From 3a08481c1f407b24e37841708d3e9b44c9c0ffd4 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 14:10:38 +0100 Subject: [PATCH 09/49] initial draft for sort, 1D, key sort --- cunumeric/array.py | 36 ++++++- cunumeric/config.py | 1 + cunumeric/deferred.py | 8 ++ cunumeric/eager.py | 7 ++ cunumeric/lazy.py | 2 +- cunumeric/module.py | 28 ++++++ src/cunumeric.mk | 3 + src/cunumeric/cunumeric_c.h | 1 + src/cunumeric/sort/sort.cc | 134 +++++++++++++++++++++++++++ src/cunumeric/sort/sort.cu | 57 ++++++++++++ src/cunumeric/sort/sort.h | 68 ++++++++++++++ src/cunumeric/sort/sort_omp.cc | 49 ++++++++++ src/cunumeric/sort/sort_template.inl | 59 ++++++++++++ tests/sort.py | 44 +++++++++ 14 files changed, 495 insertions(+), 2 deletions(-) create mode 100644 src/cunumeric/sort/sort.cc create mode 100644 src/cunumeric/sort/sort.cu create mode 100644 src/cunumeric/sort/sort.h create mode 100644 src/cunumeric/sort/sort_omp.cc create mode 100644 src/cunumeric/sort/sort_template.inl create mode 100644 tests/sort.py diff --git a/cunumeric/array.py b/cunumeric/array.py index c214db335..0becb1712 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1476,7 +1476,41 @@ def setfield(self, val, dtype, offset=0): ) def setflags(self, write=None, align=None, uic=None): - self.__array__().setflags(write=write, align=align, uic=uic) + self.__array__(stacklevel=2).setflags( + write=write, align=align, uic=uic + ) + + def sort(self, axis=-1, kind="stable", order=None): + if kind != "stable": + runtime.warn( + "cuNumeric uses a different (stable) algorithm than " + + str(kind) + + " for sorting", + category=RuntimeWarning, + stacklevel=2, + ) + if order is not None: + raise NotImplementedError( + "cuNumeric does not support sorting with 'order' as " + "ndarray only supports numeric values" + ) + if axis >= self.ndim or axis < -self.ndim: + raise ValueError("invalid axis") + + if self._thunk.scalar: + # nothing to do + return + elif self.ndim == 1: + # this is the default -- sorting of 1D array + self._thunk.sort(axis=axis) + return + else: + raise NotImplementedError( + "cuNumeric only supports sorting 1D arrays at the moment" + ) + + # no return value + return def squeeze(self, axis=None): if axis is not None: diff --git a/cunumeric/config.py b/cunumeric/config.py index 2968c50ea..d4e942fda 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -100,6 +100,7 @@ class CuNumericOpCode(IntEnum): RAND = _cunumeric.CUNUMERIC_RAND READ = _cunumeric.CUNUMERIC_READ SCALAR_UNARY_RED = _cunumeric.CUNUMERIC_SCALAR_UNARY_RED + SORT = _cunumeric.CUNUMERIC_SORT SYRK = _cunumeric.CUNUMERIC_SYRK TILE = _cunumeric.CUNUMERIC_TILE TRANSPOSE_COPY_2D = _cunumeric.CUNUMERIC_TRANSPOSE_COPY_2D diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 101dc184d..871f32c1a 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1517,3 +1517,11 @@ def cholesky(self, src, no_tril=False): cholesky(self, src) if not no_tril: self.trilu(self, 0, True) + + def sort(self, axis=-1, kind="stable", order=None): + # TODO support axis parameter + self.runtime.legate_runtime.issue_execution_fence(block=True) + task = self.context.create_task(CuNumericOpCode.SORT) + task.add_output(self.base) + task.execute() + self.runtime.legate_runtime.issue_execution_fence(block=True) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index e1a19b1c2..0ebdd8959 100644 --- a/cunumeric/eager.py 
+++ b/cunumeric/eager.py @@ -502,6 +502,13 @@ def nonzero(self): result += (EagerArray(self.runtime, array),) return result + def sort(self, axis=-1, kind="stable", order=None): + self.check_eager_args(axis, kind, order) + if self.deferred is not None: + self.deferred.sort(axis, kind, order) + else: + self.array.sort(axis, kind, order) + def random_uniform(self): if self.deferred is not None: self.deferred.random_uniform() diff --git a/cunumeric/lazy.py b/cunumeric/lazy.py index bca9b7103..5b861a749 100644 --- a/cunumeric/lazy.py +++ b/cunumeric/lazy.py @@ -128,7 +128,7 @@ def bincount(self, rhs, stacklevel, weights=None): def nonzero(self, stacklevel): raise NotImplementedError("Implement in derived classes") - def sort(self, rhs, stacklevel): + def sort(self, axis, kind, order): raise NotImplementedError("Implement in derived classes") def random_uniform(self, stacklevel): diff --git a/cunumeric/module.py b/cunumeric/module.py index 9237c7e6c..55998d448 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5846,3 +5846,31 @@ def bincount(a, weights=None, minlength=0): ) out._thunk.bincount(a._thunk, weights=weights._thunk) return out + +# Sorting + +def argsort(a, axis=-1, kind="stable", order=None): + array = ndarray.convert_to_cunumeric_ndarray(a) + return array.argsort(axis=axis, kind=kind, order=order) + + +def lexsort(a, axis=-1): + raise NotImplementedError("Not yet implemented") + + +def msort(a): + return sort(a) + + +def sort(a, axis=-1, kind="stable", order=None): + array = ndarray.convert_to_cunumeric_ndarray(a) + out = array.copy() + out_array = ndarray.convert_to_cunumeric_ndarray(out) + out_array._thunk.sort(axis=axis, kind=kind, order=order) + return out_array + + +def sort_complex(a): + return sort(a) + + diff --git a/src/cunumeric.mk b/src/cunumeric.mk index 20c74128a..f695ac97b 100644 --- a/src/cunumeric.mk +++ b/src/cunumeric.mk @@ -43,6 +43,7 @@ GEN_CPU_SRC += cunumeric/ternary/where.cc \ cunumeric/matrix/util.cc \ cunumeric/random/rand.cc \ cunumeric/search/nonzero.cc \ + cunumeric/sort/sort.cc \ cunumeric/stat/bincount.cc \ cunumeric/convolution/convolve.cc \ cunumeric/transform/flip.cc \ @@ -76,6 +77,7 @@ GEN_CPU_SRC += cunumeric/ternary/where_omp.cc \ cunumeric/matrix/util_omp.cc \ cunumeric/random/rand_omp.cc \ cunumeric/search/nonzero_omp.cc \ + cunumeric/sort/sort_omp.cc \ cunumeric/stat/bincount_omp.cc \ cunumeric/convolution/convolve_omp.cc \ cunumeric/transform/flip_omp.cc @@ -112,6 +114,7 @@ GEN_GPU_SRC += cunumeric/ternary/where.cu \ cunumeric/matrix/trsm.cu \ cunumeric/random/rand.cu \ cunumeric/search/nonzero.cu \ + cunumeric/sort/sort.cu \ cunumeric/stat/bincount.cu \ cunumeric/convolution/convolve.cu \ cunumeric/transform/flip.cu \ diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index e6612abe1..abbc13fbb 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -45,6 +45,7 @@ enum CuNumericOpCode { CUNUMERIC_RAND, CUNUMERIC_READ, CUNUMERIC_SCALAR_UNARY_RED, + CUNUMERIC_SORT, CUNUMERIC_SYRK, CUNUMERIC_TILE, CUNUMERIC_TRANSPOSE_COPY_2D, diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc new file mode 100644 index 000000000..e42535cfc --- /dev/null +++ b/src/cunumeric/sort/sort.cc @@ -0,0 +1,134 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/sort/sort.h" +#include "cunumeric/sort/sort_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +// general routine +template +struct SortImplBody { + using VAL = legate_type_of; + + void operator()(VAL* inptr, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + bool is_index_space, + Legion::DomainPoint index_point, + Legion::Domain domain) + { + // std::cout << "local size = " << volume << ", dist. = " << is_index_space << ", index_point = + // " + // << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << + // std::endl; + + std::sort(inptr, inptr + volume); + + // in case of distributed data we need to switch to sample sort + if (is_index_space) { + // create (starting) sample of (at most) domain.get_volume() equidistant values + // also enrich values with additional indexes rank & local position in order to handle + // duplicate values + size_t num_local_samples = std::min(domain.get_volume(), volume); + size_t local_rank = index_point[0]; + auto local_samples = std::make_unique[]>(num_local_samples); + for (int i = 0; i < num_local_samples; ++i) { + const size_t index = (i + 1) * volume / num_local_samples - 1; + local_samples[i].value = inptr[index]; + local_samples[i].rank = local_rank; + local_samples[i].local_id = index; + } + + // std::cout << "local samples: size = " << num_local_samples << std::endl; + // std::cout << "first = (" << local_samples[0].value << "," << local_samples[0].rank << ","<< + // local_samples[0].local_id << ")" << std::endl; std::cout << "last = (" << + // local_samples[num_local_samples-1].value << "," << local_samples[num_local_samples-1].rank + // << ","<< local_samples[num_local_samples-1].local_id << ")" << std::endl; + + // all2all those samples + // TODO broadcast package size + // TODO allocate targets + // TODO broadcast samples + size_t num_global_samples = 15; + std::unique_ptr[]> global_samples(new SampleEntry[num_global_samples]); + + // sort all samples (utilize 2nd and 3rd sort criteria as well) + std::sort(&(global_samples[0]), + &(global_samples[0]) + num_global_samples, + SampleEntryComparator()); + + // define splitters + auto splitters = std::make_unique[]>(domain.get_volume() - 1); + for (int i = 0; i < domain.get_volume() - 1; ++i) { + const size_t index = (i + 1) * num_global_samples / domain.get_volume() - 1; + splitters[i] = global_samples[index]; + } + + do { + // compute local package sizes for every process based on splitters + std::unique_ptr local_partition_size(new size_t[domain.get_volume()]); + { + size_t range_start = 0; + size_t local_position = 0; + for (int p_index = 0; p_index < domain.get_volume(); ++p) { + while (local_position < volume && still smaller or equal) { local_position++; } + + local_partition_size[partition_index++] = local_position - range_start; + range_start = local_position; + } + } + + // communicate local package-sizes all2all + // TODO + + // evaluate distribution result?? 
+ // TODO + + // if (good enough) break; + // TODO + break; + // else iterate/improve splitters + // TODO + + } while (true); + + // all2all accepted distribution + // package sizes should already be known + // all2all communication + // TODO + + // final merge sort of received packages + // TODO + } + } +}; + +/*static*/ void SortTask::cpu_variant(TaskContext& context) +{ + sort_template(context); +} + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) { SortTask::register_variants(); } +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu new file mode 100644 index 000000000..f76b2871c --- /dev/null +++ b/src/cunumeric/sort/sort.cu @@ -0,0 +1,57 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/sort/sort.h" +#include "cunumeric/sort/sort_template.inl" + +#include +#include +#include + +#include "cunumeric/cuda_help.h" + +namespace cunumeric { + +using namespace Legion; + +template +struct SortImplBody { + using VAL = legate_type_of; + + void operator()(VAL* inptr, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + bool is_index_space, + Legion::DomainPoint index_point, + Legion::Domain domain) + { + thrust::sort(inptr, inptr + volume); + + // in case of distributed data we need to switch to sample sort + if (is_index_space) { + // not implemented yet + assert(false); + } + } +}; + +/*static*/ void SortTask::gpu_variant(TaskContext& context) +{ + sort_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h new file mode 100644 index 000000000..8c3f5a0df --- /dev/null +++ b/src/cunumeric/sort/sort.h @@ -0,0 +1,68 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" + +namespace cunumeric { + +struct SortArgs { + Array& output; + bool is_index_space; + Legion::DomainPoint index_point; + Legion::Domain domain; +}; + +template +struct SampleEntry { + VAL value; + size_t rank; + size_t local_id; +}; + +template +struct SampleEntryComparator { + bool operator()(const SampleEntry& a, const SampleEntry& b) const + { + if (a.value < b.value) { + return true; + } else if (a.value == b.value) { + if (a.rank < b.rank) { + return true; + } else if (a.rank == b.rank) { + return a.local_id < b.local_id; + } + } + return false; + } +}; + +class SortTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_SORT; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc new file mode 100644 index 000000000..d8ffadbd0 --- /dev/null +++ b/src/cunumeric/sort/sort_omp.cc @@ -0,0 +1,49 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/sort/sort.h" +#include "cunumeric/sort/sort_template.inl" + +#include + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct SortImplBody { + using VAL = legate_type_of; + + void operator()(VAL* inptr, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + bool is_index_space, + Legion::DomainPoint index_point, + Legion::Domain domain) + { + // not implemented yet + assert(false); + } +}; + +/*static*/ void SortTask::omp_variant(TaskContext& context) +{ + sort_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl new file mode 100644 index 000000000..5355bdbbe --- /dev/null +++ b/src/cunumeric/sort/sort_template.inl @@ -0,0 +1,59 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+
+#include "cunumeric/pitches.h"
+
+namespace cunumeric {
+
+using namespace Legion;
+using namespace legate;
+
+template <VariantKind KIND, LegateTypeCode CODE, int32_t DIM>
+struct SortImplBody;
+
+template <VariantKind KIND>
+struct SortImpl {
+  template <LegateTypeCode CODE, int32_t DIM>
+  void operator()(SortArgs& args) const
+  {
+    using VAL = legate_type_of<CODE>;
+
+    auto rect = args.output.shape<DIM>();
+
+    Pitches<DIM - 1> pitches;
+    size_t volume = pitches.flatten(rect);
+
+    // TODO -- we cannot stop early! we must proceed, as this partition might be filled later
+    if (volume == 0) { return; }
+
+    auto inout = args.output.read_write_accessor<VAL, DIM>(rect);
+
+    SortImplBody<KIND, CODE, DIM>()(
+      inout.ptr(rect), pitches, rect, volume, args.is_index_space, args.index_point, args.domain);
+  }
+};
+
+template <VariantKind KIND>
+static void sort_template(TaskContext& context)
+{
+  SortArgs args{context.outputs()[0],
+                context.task_->is_index_space,
+                context.task_->index_point,
+                context.task_->index_domain};
+  double_dispatch(args.output.dim(), args.output.code(), SortImpl<KIND>{}, args);
+}
+
+}  // namespace cunumeric
diff --git a/tests/sort.py b/tests/sort.py
new file mode 100644
index 000000000..ab5c91193
--- /dev/null
+++ b/tests/sort.py
@@ -0,0 +1,44 @@
+# Copyright 2021 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+
+import cunumeric as num
+
+
+def test():
+    np.random.seed(42)
+    A_np = np.array(np.random.randint(10, size=30), dtype=np.int32)
+
+    A_num = num.array(A_np)
+    print("Sorting array : " + str(A_np))
+
+    sortA_np = np.sort(A_np)
+    print("Result numpy : " + str(sortA_np))
+
+    # pdb.set_trace()
+    sortA_num = num.sort(A_num)
+    print("Result cunumeric: " + str(sortA_num))
+
+    A_num.sort()
+    print("Result (inplace): " + str(A_num))
+
+    assert num.allclose(sortA_np, sortA_num)
+
+    return
+
+
+if __name__ == "__main__":
+    test()

From 131fb6d3fd583888207ad401427ac79fb5072c36 Mon Sep 17 00:00:00 2001
From: mfoerste4
Date: Tue, 8 Feb 2022 15:17:28 +0100
Subject: [PATCH 10/49] fixed compile error

---
 src/cunumeric/sort/sort.cc | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc
index e42535cfc..28cf83c23 100644
--- a/src/cunumeric/sort/sort.cc
+++ b/src/cunumeric/sort/sort.cc
@@ -84,15 +84,23 @@ struct SortImplBody {

     do {
       // compute local package sizes for every process based on splitters
-      std::unique_ptr local_partition_size(new size_t[domain.get_volume()]);
+      std::unique_ptr local_partition_size(new size_t[domain.get_volume()]);
       {
         size_t range_start    = 0;
         size_t local_position = 0;
-        for (int p_index = 0; p_index < domain.get_volume(); ++p) {
-          while (local_position < volume && still smaller or equal) { local_position++; }
-
-          local_partition_size[partition_index++] = local_position - range_start;
-          range_start = local_position;
+        for (int p_index = 0; p_index < domain.get_volume(); ++p_index) {
+          // advance while the current value is less than or equal to the current splitter
+          while (local_position < volume &&
+                 (inptr[local_position] < splitters[p_index].value ||
+                  (inptr[local_position] == splitters[p_index].value &&
+                   
(local_rank < splitters[p_index].rank || + (local_rank == splitters[p_index].rank && + local_position <= splitters[p_index].local_id))))) { + local_position++; + } + + local_partition_size[p_index++] = local_position - range_start; + range_start = local_position; } } From b115835512ceba71a215c0338497b96473c81c30 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 21:03:50 +0100 Subject: [PATCH 11/49] OpenMP non-distributed implementation, some small fixes, benchmark tool --- examples/sort.py | 102 +++++++++++++++++++++++++++++++++ src/cunumeric/sort/sort.cc | 17 +++--- src/cunumeric/sort/sort.cu | 9 ++- src/cunumeric/sort/sort_omp.cc | 72 ++++++++++++++++++++++- tests/sort.py | 4 +- 5 files changed, 191 insertions(+), 13 deletions(-) create mode 100644 examples/sort.py diff --git a/examples/sort.py b/examples/sort.py new file mode 100644 index 000000000..9142c8a12 --- /dev/null +++ b/examples/sort.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +# Copyright 2021 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import datetime + +import numpy +from benchmark import run_benchmark + +import cunumeric + + +def check_sorted(a, a_numpy): + a_sorted = numpy.sort(a_numpy) + print("Checking result...") + if cunumeric.allclose(a_sorted, a): + print("PASS!") + else: + print("FAIL!") + print("NUMPY : " + str(a_sorted)) + print("CUNUMERIC: " + str(a)) + + +def run_sort(N, perform_check, timing): + + numpy.random.seed(42) + a_numpy = numpy.array( + numpy.random.randint(1000, size=N), dtype=numpy.int32 + ) + a = cunumeric.array(a_numpy) + + start = datetime.datetime.now() + a_sorted = cunumeric.sort(a) + stop = datetime.datetime.now() + + if perform_check: + check_sorted(a_sorted, a_numpy) + else: + # do we need to synchronize? + assert True + delta = stop - start + total = delta.total_seconds() * 1000.0 + if timing: + print("Elapsed Time: " + str(total) + " ms") + return total + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--check", + dest="check", + action="store_true", + help="check the result of the solve", + ) + parser.add_argument( + "-n", + "--num", + type=int, + default=1000000, + dest="N", + help="number of elements in one dimension", + ) + parser.add_argument( + "-t", + "--time", + dest="timing", + action="store_true", + help="perform timing", + ) + parser.add_argument( + "-b", + "--benchmark", + type=int, + default=1, + dest="benchmark", + help="number of times to benchmark this application (default 1 - " + "normal execution)", + ) + + args = parser.parse_args() + run_benchmark( + run_sort, + args.benchmark, + "Sort", + (args.N, args.check, args.timing), + ) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 28cf83c23..af96acdba 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -35,12 +35,13 @@ struct SortImplBody { Legion::DomainPoint index_point, Legion::Domain domain) { - // std::cout << "local size = " << volume << ", dist. 
= " << is_index_space << ", index_point = - // " - // << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << - // std::endl; +#ifdef DEBUG_CUNUMERIC + std::cout << "CPU(" << index_point[0] << "): local size = " << volume + << ", dist. = " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; +#endif - std::sort(inptr, inptr + volume); + std::stable_sort(inptr, inptr + volume); // in case of distributed data we need to switch to sample sort if (is_index_space) { @@ -71,9 +72,9 @@ struct SortImplBody { std::unique_ptr[]> global_samples(new SampleEntry[num_global_samples]); // sort all samples (utilize 2nd and 3rd sort criteria as well) - std::sort(&(global_samples[0]), - &(global_samples[0]) + num_global_samples, - SampleEntryComparator()); + std::stable_sort(&(global_samples[0]), + &(global_samples[0]) + num_global_samples, + SampleEntryComparator()); // define splitters auto splitters = std::make_unique[]>(domain.get_volume() - 1); diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index f76b2871c..2ce1987ad 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -39,7 +39,14 @@ struct SortImplBody { Legion::DomainPoint index_point, Legion::Domain domain) { - thrust::sort(inptr, inptr + volume); +#ifdef DEBUG_CUNUMERIC + std::cout << "GPU(" << index_point[0] << "): local size = " << volume + << ", dist. = " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; +#endif + + thrust::device_ptr dev_ptr(inptr); + thrust::stable_sort(dev_ptr, dev_ptr + volume); // in case of distributed data we need to switch to sample sort if (is_index_space) { diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index d8ffadbd0..1f2d00262 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -24,6 +24,51 @@ namespace cunumeric { using namespace Legion; using namespace legate; +template +void merge(VAL* inptr, size_t start_idx, size_t end_idx, VAL* tmp) +{ + const size_t mid = (end_idx + start_idx) / 2; + size_t left_idx = start_idx; + size_t right_idx = mid; + size_t target_idx = start_idx; + + while (left_idx < mid && right_idx < end_idx) { + if (inptr[left_idx] <= inptr[right_idx]) { + tmp[target_idx++] = inptr[left_idx++]; + } else { + tmp[target_idx++] = inptr[right_idx++]; + } + } + + while (left_idx < mid) { tmp[target_idx++] = inptr[left_idx++]; } + while (right_idx < end_idx) { tmp[target_idx++] = inptr[right_idx++]; } + + std::copy(tmp + start_idx, tmp + end_idx, inptr + start_idx); +} + +// TODO tune +#define SEQUENTIAL_THRESHOLD 1024 +#define TASK_THRESHOLD 2048 + +template +void merge_sort(VAL* inptr, const size_t start_idx, const size_t end_idx, VAL* tmp) +{ + const size_t size = end_idx - start_idx + 1; + if (size > SEQUENTIAL_THRESHOLD) { + const size_t mid = (end_idx + start_idx) / 2; + +#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) + merge_sort(inptr, start_idx, mid, tmp); +#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) + merge_sort(inptr, mid, end_idx, tmp); + +#pragma omp taskwait + merge(inptr, start_idx, end_idx, tmp); + } else if (size > 1) { + std::stable_sort(inptr + start_idx, inptr + end_idx); + } +} + template struct SortImplBody { using VAL = legate_type_of; @@ -36,8 +81,31 @@ struct SortImplBody { Legion::DomainPoint index_point, Legion::Domain domain) { - // not 
implemented yet - assert(false); +#ifdef DEBUG_CUNUMERIC + std::cout << "OMP(" << index_point[0] << ":" << omp_get_max_threads() << ":" << omp_get_nested() + << "): local size = " << volume << ", dist. = " << is_index_space + << ", index_point = " << index_point << ", domain/volume = " << domain << "/" + << domain.get_volume() << std::endl; +#endif + + bool nested = omp_get_nested(); + if (!nested) omp_set_nested(1); + + // merge sort + auto tmp = std::make_unique(volume); + +#pragma omp parallel shared(inptr, tmp) + { +#pragma omp single + merge_sort(inptr, 0, volume, &(tmp[0])); + } + + if (is_index_space) { + // not implemented yet + assert(false); + } + + if (!nested) omp_set_nested(0); } }; diff --git a/tests/sort.py b/tests/sort.py index ab5c91193..b8945d19d 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -31,11 +31,11 @@ def test(): # pdb.set_trace() sortA_num = num.sort(A_num) print("Result cunumeric: " + str(sortA_num)) + assert num.allclose(sortA_np, sortA_num) A_num.sort() print("Result (inplace): " + str(A_num)) - - assert num.allclose(sortA_np, sortA_num) + assert num.allclose(sortA_np, A_num) return From 03608cfaf7ca2903d41063115d0ebf01782412d3 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 21:10:51 +0100 Subject: [PATCH 12/49] added missing include --- src/cunumeric/sort/sort.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 2ce1987ad..5530de63d 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include "cunumeric/cuda_help.h" From 85bc3a73c5ca59adf74a2c939724daadb70bcdd6 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 21:48:03 +0100 Subject: [PATCH 13/49] switch to parallel gcc sort --- src/cunumeric/sort/sort_omp.cc | 60 ++-------------------------------- 1 file changed, 2 insertions(+), 58 deletions(-) diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 1f2d00262..030d26cf0 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -17,6 +17,7 @@ #include "cunumeric/sort/sort.h" #include "cunumeric/sort/sort_template.inl" +#include #include namespace cunumeric { @@ -24,51 +25,6 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template -void merge(VAL* inptr, size_t start_idx, size_t end_idx, VAL* tmp) -{ - const size_t mid = (end_idx + start_idx) / 2; - size_t left_idx = start_idx; - size_t right_idx = mid; - size_t target_idx = start_idx; - - while (left_idx < mid && right_idx < end_idx) { - if (inptr[left_idx] <= inptr[right_idx]) { - tmp[target_idx++] = inptr[left_idx++]; - } else { - tmp[target_idx++] = inptr[right_idx++]; - } - } - - while (left_idx < mid) { tmp[target_idx++] = inptr[left_idx++]; } - while (right_idx < end_idx) { tmp[target_idx++] = inptr[right_idx++]; } - - std::copy(tmp + start_idx, tmp + end_idx, inptr + start_idx); -} - -// TODO tune -#define SEQUENTIAL_THRESHOLD 1024 -#define TASK_THRESHOLD 2048 - -template -void merge_sort(VAL* inptr, const size_t start_idx, const size_t end_idx, VAL* tmp) -{ - const size_t size = end_idx - start_idx + 1; - if (size > SEQUENTIAL_THRESHOLD) { - const size_t mid = (end_idx + start_idx) / 2; - -#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) - merge_sort(inptr, start_idx, mid, tmp); -#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) - merge_sort(inptr, mid, end_idx, tmp); - -#pragma omp taskwait - merge(inptr, start_idx, end_idx, 
tmp); - } else if (size > 1) { - std::stable_sort(inptr + start_idx, inptr + end_idx); - } -} - template struct SortImplBody { using VAL = legate_type_of; @@ -88,24 +44,12 @@ struct SortImplBody { << domain.get_volume() << std::endl; #endif - bool nested = omp_get_nested(); - if (!nested) omp_set_nested(1); - - // merge sort - auto tmp = std::make_unique(volume); - -#pragma omp parallel shared(inptr, tmp) - { -#pragma omp single - merge_sort(inptr, 0, volume, &(tmp[0])); - } + __gnu_parallel::stable_sort(inptr, inptr + volume); if (is_index_space) { // not implemented yet assert(false); } - - if (!nested) omp_set_nested(0); } }; From 188077bf010de4c8b4b002b2bca79d01540de375 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 10 Feb 2022 09:16:31 -0800 Subject: [PATCH 14/49] Enable N-D non-distributed sort --- cunumeric/array.py | 11 +--- cunumeric/deferred.py | 37 ++++++++++--- cunumeric/module.py | 9 ++-- src/cunumeric/sort/sort.cc | 15 ++++-- src/cunumeric/sort/sort.cu | 7 ++- src/cunumeric/sort/sort.h | 1 + src/cunumeric/sort/sort_omp.cc | 15 +++++- src/cunumeric/sort/sort_template.inl | 31 +++++++++-- tests/sort.py | 77 +++++++++++++++++++++++++++- 9 files changed, 171 insertions(+), 32 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 0becb1712..85a9f4664 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1500,16 +1500,9 @@ def sort(self, axis=-1, kind="stable", order=None): if self._thunk.scalar: # nothing to do return - elif self.ndim == 1: - # this is the default -- sorting of 1D array - self._thunk.sort(axis=axis) - return else: - raise NotImplementedError( - "cuNumeric only supports sorting 1D arrays at the moment" - ) - - # no return value + # this is the default -- sorting of N-D array + self._thunk.sort(axis=axis) return def squeeze(self, axis=None): diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 871f32c1a..3edf8c75c 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1519,9 +1519,34 @@ def cholesky(self, src, no_tril=False): self.trilu(self, 0, True) def sort(self, axis=-1, kind="stable", order=None): - # TODO support axis parameter - self.runtime.legate_runtime.issue_execution_fence(block=True) - task = self.context.create_task(CuNumericOpCode.SORT) - task.add_output(self.base) - task.execute() - self.runtime.legate_runtime.issue_execution_fence(block=True) + axis_normalized = axis + if axis_normalized < 0: + axis_normalized = self.ndim + axis + + if axis_normalized is not self.ndim - 1: + assert axis_normalized < self.ndim - 1 and axis_normalized >= 0 + + # swap axes + swapped = self.swapaxes(axis_normalized, self.ndim - 1) + + # FIXME: ensure *new* distribution does not split last axis (!) 
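
For context, the swap-copy-sort pattern this hunk introduces mirrors plain NumPy semantics. A minimal reference sketch (the helper name is hypothetical, not part of the patch):

    import numpy as np

    def sort_along_axis(a, axis):
        # move the requested axis to the end, sort the now-contiguous last axis,
        # then move the axis back to its original position
        swapped = a.swapaxes(axis, a.ndim - 1)
        return np.sort(swapped, axis=-1).swapaxes(axis, a.ndim - 1)

The deep copy created just below materializes the transposed view, so the sort task always sees its sort dimension contiguous.
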
+ swapped_copy = self.runtime.create_empty_thunk( + swapped.shape, dtype=self.dtype, inputs=[self, swapped] + ) + swapped_copy.copy(swapped, deep=True) + + # run sort on last axis + swapped_copy.sort(self.ndim - 1) + + self.base = swapped_copy.swapaxes( + axis_normalized, self.ndim - 1 + ).base + self.numpy_array = None + else: + # run actual sort task + self.runtime.legate_runtime.issue_execution_fence(block=True) + task = self.context.create_task(CuNumericOpCode.SORT) + task.add_output(self.base) + task.add_scalar_arg(self.base.shape[self.ndim - 1], ty.uint64) + task.execute() + self.runtime.legate_runtime.issue_execution_fence(block=True) diff --git a/cunumeric/module.py b/cunumeric/module.py index 55998d448..fce223979 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5849,9 +5849,10 @@ def bincount(a, weights=None, minlength=0): # Sorting + +@add_boilerplate("a") def argsort(a, axis=-1, kind="stable", order=None): - array = ndarray.convert_to_cunumeric_ndarray(a) - return array.argsort(axis=axis, kind=kind, order=order) + return a.argsort(axis=axis, kind=kind, order=order) def lexsort(a, axis=-1): @@ -5862,9 +5863,9 @@ def msort(a): return sort(a) +@add_boilerplate("a") def sort(a, axis=-1, kind="stable", order=None): - array = ndarray.convert_to_cunumeric_ndarray(a) - out = array.copy() + out = a.copy() out_array = ndarray.convert_to_cunumeric_ndarray(out) out_array._thunk.sort(axis=axis, kind=kind, order=order) return out_array diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index af96acdba..4d26ce1ad 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -31,6 +31,7 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -39,12 +40,20 @@ struct SortImplBody { std::cout << "CPU(" << index_point[0] << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; + + if (volume <= 30) { + std::cout << "inptr = [ "; + for (size_t i = 0; i < volume; ++i) { std::cout << (i > 0 ? 
", " : " ") << inptr[i]; } + std::cout << "]" << std::endl; + } #endif - std::stable_sort(inptr, inptr + volume); + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + } - // in case of distributed data we need to switch to sample sort - if (is_index_space) { + // in case of distributed data (1D) we need to switch to sample sort + if (is_index_space && DIM == 1) { // create (starting) sample of (at most) domain.get_volume() equidistant values // also enrich values with additional indexes rank & local position in order to handle // duplicate values diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 5530de63d..f43445a8b 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -36,6 +36,7 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -47,10 +48,12 @@ struct SortImplBody { #endif thrust::device_ptr dev_ptr(inptr); - thrust::stable_sort(dev_ptr, dev_ptr + volume); + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + thrust::stable_sort(dev_ptr + start_idx, dev_ptr + start_idx + sort_dim_size); + } // in case of distributed data we need to switch to sample sort - if (is_index_space) { + if (is_index_space && DIM == 1) { // not implemented yet assert(false); } diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index 8c3f5a0df..febc1f57c 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -22,6 +22,7 @@ namespace cunumeric { struct SortArgs { Array& output; + size_t sort_dim_size; bool is_index_space; Legion::DomainPoint index_point; Legion::Domain domain; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 030d26cf0..6c26f07ae 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -33,6 +33,7 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -44,9 +45,19 @@ struct SortImplBody { << domain.get_volume() << std::endl; #endif - __gnu_parallel::stable_sort(inptr, inptr + volume); + if (volume / sort_dim_size > omp_get_max_threads() / 2) // TODO fine tune + { +#pragma omp do schedule(dynamic) + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + } + } else { + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + __gnu_parallel::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + } + } - if (is_index_space) { + if (is_index_space && DIM == 1) { // not implemented yet assert(false); } diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 5355bdbbe..b1922f014 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -36,13 +36,33 @@ struct SortImpl { Pitches pitches; size_t volume = pitches.flatten(rect); - // TODO -- we cannot stop! need to proceed as partition might be filled later - if (volume == 0) { return; } - auto inout = args.output.read_write_accessor(rect); - SortImplBody()( - inout.ptr(rect), pitches, rect, volume, args.is_index_space, args.index_point, args.domain); + /* + * Assumptions: + * 1. 
Sort is always requested for the 'last' dimension within rect + * 2. We have product_of_all_other_dimensions independent sort ranges + * 3. if we have more than one participants: + * a) 1D-case: we need to perform parallel sort (e.g. via sampling) + * b) ND-case: rect needs to be the full domain in that last dimension + */ + +#ifdef DEBUG_CUNUMERIC + std::cout << "DIM=" << DIM << ", rect=" << rect << ", sort_dim_size=" << args.sort_dim_size + << std::endl; + + assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.sort_dim_size)) && + "multi-dimensional array should not be distributed in last (sort) dimension"); +#endif + + SortImplBody()(inout.ptr(rect), + pitches, + rect, + volume, + args.sort_dim_size, + args.is_index_space, + args.index_point, + args.domain); } }; @@ -50,6 +70,7 @@ template static void sort_template(TaskContext& context) { SortArgs args{context.outputs()[0], + context.scalars()[0].value(), context.task_->is_index_space, context.task_->index_point, context.task_->index_domain}; diff --git a/tests/sort.py b/tests/sort.py index b8945d19d..bdc4c4b93 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -18,7 +18,17 @@ import cunumeric as num -def test(): +def test_sort_axis(a_np, a_num, axis): + assert num.allclose(a_np, a_num) + print("Sorting axis " + str(axis) + ":") + sort_np = np.sort(a_np, axis) + sort_num = num.sort(a_num, axis, kind="merge") + # print(sort_np) + # print(sort_num) + assert num.allclose(sort_np, sort_num) + + +def test_1D(): np.random.seed(42) A_np = np.array(np.random.randint(10, size=30), dtype=np.int32) @@ -40,5 +50,70 @@ def test(): return +def test_2D(): + np.random.seed(42) + x_dim = 5 + y_dim = 3 + A_np = np.array( + np.random.randint(10, size=x_dim * y_dim), dtype=np.int32 + ).reshape(x_dim, y_dim) + + A_num = num.array(A_np) + print("Sorting matrix:\n") + print(A_num) + + test_sort_axis(A_np, A_num, 1) + test_sort_axis(A_np, A_num, 0) + + return + + +def test_3D(): + np.random.seed(42) + x_dim = 5 + y_dim = 3 + z_dim = 7 + A_np = np.array( + np.random.randint(10, size=x_dim * y_dim * z_dim), dtype=np.int32 + ).reshape(x_dim, y_dim, z_dim) + + A_num = num.array(A_np) + print("Sorting 3d tensor:\n") + print(A_np) + + test_sort_axis(A_np, A_num, 2) + test_sort_axis(A_np, A_num, 1) + test_sort_axis(A_np, A_num, 0) + + return + + +def test_custom(): + a = np.arange(2 * 4).reshape(2, 4) + a_transpose = np.transpose(a) + + a_transposed_num = num.array([[0, 4], [1, 5], [2, 6], [3, 7]]) + a_num = num.array(a) + a_num_transposed = a_num.swapaxes(0, 1) + + test_sort_axis(a, a_num, 1) + test_sort_axis(a_transpose, a_transposed_num, 1) + test_sort_axis(a_transpose, a_num_transposed, 1) + test_sort_axis(a_transpose, a_num_transposed, 0) + + return + + +def test(): + print("\n\n ----------- Custom test ---------------\n") + test_custom() + print("\n\n ----------- 2D test ---------------\n") + test_2D() + print("\n\n ----------- 3D test ---------------\n") + test_3D() + print("\n\n ----------- 1D test ---------------\n") + test_1D() + + if __name__ == "__main__": test() From 5cd0956763105a2aaf9cde47006f12f19f3f799c Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Thu, 10 Feb 2022 21:48:48 +0100 Subject: [PATCH 15/49] merge after rebase to 22.03 --- cunumeric/module.py | 88 +++++++++------------------- src/cunumeric/sort/sort.cc | 2 +- src/cunumeric/sort/sort.cu | 2 +- src/cunumeric/sort/sort.h | 2 +- src/cunumeric/sort/sort_omp.cc | 2 +- src/cunumeric/sort/sort_template.inl | 2 +- 6 files changed, 33 insertions(+), 65 deletions(-) diff --git 
a/cunumeric/module.py b/cunumeric/module.py index 54d03137b..ffb5eaff5 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5571,6 +5571,34 @@ def sign(a, out=None, where=True, dtype=None, **kwargs): # Sorting, searching, and counting ################################## +# Sorting + + +@add_boilerplate("a") +def argsort(a, axis=-1, kind="stable", order=None): + return a.argsort(axis=axis, kind=kind, order=order) + + +def lexsort(a, axis=-1): + raise NotImplementedError("Not yet implemented") + + +def msort(a): + return sort(a) + + +@add_boilerplate("a") +def sort(a, axis=-1, kind="stable", order=None): + out = a.copy() + out_array = ndarray.convert_to_cunumeric_ndarray(out) + out_array._thunk.sort(axis=axis, kind=kind, order=order) + return out_array + + +def sort_complex(a): + return sort(a) + + # Searching @@ -5846,63 +5874,3 @@ def bincount(a, weights=None, minlength=0): ) out._thunk.bincount(a._thunk, weights=weights._thunk) return out - -# Sorting - - -# Sorting - - -@add_boilerplate("a") -def argsort(a, axis=-1, kind="stable", order=None): - return a.argsort(axis=axis, kind=kind, order=order) - - -def lexsort(a, axis=-1): - raise NotImplementedError("Not yet implemented") - - -def msort(a): - return sort(a) - - -@add_boilerplate("a") -def sort(a, axis=-1, kind="stable", order=None): - out = a.copy() - out_array = ndarray.convert_to_cunumeric_ndarray(out) - out_array._thunk.sort(axis=axis, kind=kind, order=order) - return out_array - - -def sort_complex(a): - return sort(a) - - -# Counting - - -@add_boilerplate("a") -def argsort(a, axis=-1, kind="stable", order=None): - return a.argsort(axis=axis, kind=kind, order=order) - - -def lexsort(a, axis=-1): - raise NotImplementedError("Not yet implemented") - - -def msort(a): - return sort(a) - - -@add_boilerplate("a") -def sort(a, axis=-1, kind="stable", order=None): - out = a.copy() - out_array = ndarray.convert_to_cunumeric_ndarray(out) - out_array._thunk.sort(axis=axis, kind=kind, order=order) - return out_array - - -def sort_complex(a): - return sort(a) - - diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 4d26ce1ad..1f4407b7f 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 NVIDIA Corporation +/* Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index f43445a8b..ff70e26d9 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -1,4 +1,4 @@ -/* Copyright 2021 NVIDIA Corporation +/* Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index febc1f57c..6afba9230 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -1,4 +1,4 @@ -/* Copyright 2021 NVIDIA Corporation +/* Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
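
With the duplicated definitions in cunumeric/module.py collapsed earlier in this patch, the module-level sorting API reads as a single block again. A minimal usage sketch of the consolidated entry points (values are illustrative only):

    import numpy as np
    import cunumeric as num

    a_np = np.array([[3, 1, 2], [9, 7, 8]], dtype=np.int32)
    a = num.array(a_np)

    assert num.allclose(num.sort(a), np.sort(a_np))                  # last axis (default)
    assert num.allclose(num.sort(a, axis=0), np.sort(a_np, axis=0))  # across rows
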
diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 6c26f07ae..c7265f447 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 NVIDIA Corporation +/* Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index b1922f014..2b4886471 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -1,4 +1,4 @@ -/* Copyright 2021 NVIDIA Corporation +/* Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 9063d77b04d530d59c6f102454ad18c984f0a7f2 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 16 Feb 2022 02:40:33 -0800 Subject: [PATCH 16/49] added cupy-style sort kernel, support axis=None, improved benchmark --- cunumeric/array.py | 2 +- cunumeric/deferred.py | 70 +++++++++++++++--------- examples/sort.py | 70 ++++++++++++++++++++---- src/cunumeric/sort/sort.cc | 8 ++- src/cunumeric/sort/sort.cu | 60 +++++++++++++++++++-- src/cunumeric/sort/sort.h | 3 +- src/cunumeric/sort/sort_omp.cc | 5 +- src/cunumeric/sort/sort_template.inl | 81 +++++++++++++++++++++++----- tests/sort.py | 24 +++++---- 9 files changed, 255 insertions(+), 68 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 85a9f4664..f128b96e8 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1494,7 +1494,7 @@ def sort(self, axis=-1, kind="stable", order=None): "cuNumeric does not support sorting with 'order' as " "ndarray only supports numeric values" ) - if axis >= self.ndim or axis < -self.ndim: + if axis is not None and (axis >= self.ndim or axis < -self.ndim): raise ValueError("invalid axis") if self._thunk.scalar: diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 3edf8c75c..aaa8f6c26 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1519,34 +1519,52 @@ def cholesky(self, src, no_tril=False): self.trilu(self, 0, True) def sort(self, axis=-1, kind="stable", order=None): - axis_normalized = axis - if axis_normalized < 0: - axis_normalized = self.ndim + axis + if axis is None and self.ndim > 1: + flattened = self.reshape((self.size,), order="C") + flattened_copy = self.runtime.create_empty_thunk( + flattened.shape, dtype=self.dtype, inputs=[self, flattened] + ) + flattened_copy.copy(flattened, deep=True) - if axis_normalized is not self.ndim - 1: - assert axis_normalized < self.ndim - 1 and axis_normalized >= 0 + # run sort on last axis -- return 1D solution + flattened_copy.sort() + self.base = flattened_copy.base + self.numpy_array = None + else: + if axis is None: + sort_axis = 0 + elif axis < 0: + sort_axis = self.ndim + axis + else: + sort_axis = axis - # swap axes - swapped = self.swapaxes(axis_normalized, self.ndim - 1) + if sort_axis is not self.ndim - 1: + assert sort_axis < self.ndim - 1 and sort_axis >= 0 - # FIXME: ensure *new* distribution does not split last axis (!) 
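
A note on the FIXME above, which this patch retires: the guarantee it asks for is now provided at task-launch time further down, where the last dimension of the store is broadcast. Conceptually (pseudo-Python, assuming legate partitioning semantics):

    # task.add_broadcast(store, axis) => every point task sees the full extent of `axis`,
    # so a last-axis sort never has to communicate across partitions
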
- swapped_copy = self.runtime.create_empty_thunk( - swapped.shape, dtype=self.dtype, inputs=[self, swapped] - ) - swapped_copy.copy(swapped, deep=True) + # swap axes + swapped = self.swapaxes(sort_axis, self.ndim - 1) - # run sort on last axis - swapped_copy.sort(self.ndim - 1) + swapped_copy = self.runtime.create_empty_thunk( + swapped.shape, dtype=self.dtype, inputs=[self, swapped] + ) + swapped_copy.copy(swapped, deep=True) - self.base = swapped_copy.swapaxes( - axis_normalized, self.ndim - 1 - ).base - self.numpy_array = None - else: - # run actual sort task - self.runtime.legate_runtime.issue_execution_fence(block=True) - task = self.context.create_task(CuNumericOpCode.SORT) - task.add_output(self.base) - task.add_scalar_arg(self.base.shape[self.ndim - 1], ty.uint64) - task.execute() - self.runtime.legate_runtime.issue_execution_fence(block=True) + # run sort on last axis + swapped_copy.sort(self.ndim - 1) + + self.base = swapped_copy.swapaxes( + self.ndim - 1, sort_axis + ).base + self.numpy_array = None + else: + # run actual sort task + self.runtime.legate_runtime.issue_execution_fence(block=True) + task = self.context.create_task(CuNumericOpCode.SORT) + task.add_output(self.base) + task.add_input(self.base) + if self.ndim > 1: + task.add_broadcast(self.base, self.ndim - 1) + task.add_scalar_arg(self.ndim - 1, ty.int32) + task.add_scalar_arg(self.base.shape, (ty.int32,)) + task.execute() + self.runtime.legate_runtime.issue_execution_fence(block=True) diff --git a/examples/sort.py b/examples/sort.py index 9142c8a12..21b503708 100644 --- a/examples/sort.py +++ b/examples/sort.py @@ -24,8 +24,8 @@ import cunumeric -def check_sorted(a, a_numpy): - a_sorted = numpy.sort(a_numpy) +def check_sorted(a, a_numpy, axis=-1): + a_sorted = numpy.sort(a_numpy, axis) print("Checking result...") if cunumeric.allclose(a_sorted, a): print("PASS!") @@ -35,20 +35,40 @@ def check_sorted(a, a_numpy): print("CUNUMERIC: " + str(a)) -def run_sort(N, perform_check, timing): +def run_sort(N, shape, axis, datatype, perform_check, timing): numpy.random.seed(42) - a_numpy = numpy.array( - numpy.random.randint(1000, size=N), dtype=numpy.int32 - ) + newtype = numpy.dtype(datatype).type + + if numpy.issubdtype(newtype, numpy.integer): + a_numpy = numpy.array( + numpy.random.randint( + numpy.iinfo(newtype).min, numpy.iinfo(newtype).max, size=N + ), + dtype=newtype, + ) + elif numpy.issubdtype(newtype, numpy.floating): + a_numpy = numpy.array(numpy.random.random(size=N), dtype=newtype) + elif numpy.issubdtype(newtype, numpy.complexfloating): + a_numpy = numpy.array( + numpy.random.random(size=N) + numpy.random.random(size=N) * 1j, + dtype=newtype, + ) + else: + print("UNKNOWN type " + str(newtype)) + assert False + + if shape is not None: + a_numpy = a_numpy.reshape(tuple(shape)) + a = cunumeric.array(a_numpy) start = datetime.datetime.now() - a_sorted = cunumeric.sort(a) + a_sorted = cunumeric.sort(a, axis) stop = datetime.datetime.now() if perform_check: - check_sorted(a_sorted, a_numpy) + check_sorted(a_sorted, a_numpy, axis) else: # do we need to synchronize? 
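
One hedged answer to the synchronization question raised here: cuNumeric issues tasks asynchronously, so the stop timestamp may be taken before the sort has actually finished. Lacking a dedicated sync API, a common workaround is to force a blocking scalar read on the result inside the timed region, e.g.:

    _ = float(a_sorted[0])  # blocks until the deferred sort has completed (illustrative only)
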
assert True @@ -83,6 +103,31 @@ def run_sort(N, perform_check, timing): action="store_true", help="perform timing", ) + parser.add_argument( + "-s", + "--shape", + type=int, + nargs="+", + default=None, + dest="shape", + help="array reshape (default 'None')", + ) + parser.add_argument( + "-d", + "--datatype", + type=str, + default="uint32", + dest="datatype", + help="data type (default numpy.int32)", + ) + parser.add_argument( + "-a", + "--axis", + type=int, + default=-1, + dest="axis", + help="sort axis (default -1)", + ) parser.add_argument( "-b", "--benchmark", @@ -98,5 +143,12 @@ def run_sort(N, perform_check, timing): run_sort, args.benchmark, "Sort", - (args.N, args.check, args.timing), + ( + args.N, + args.shape, + args.axis, + args.datatype, + args.check, + args.timing, + ), ) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 1f4407b7f..33bd9e83b 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -31,7 +31,8 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, - const size_t sort_dim_size, + const uint32_t sort_axis, + Legion::DomainPoint global_shape, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -47,13 +48,16 @@ struct SortImplBody { std::cout << "]" << std::endl; } #endif - + const size_t sort_dim_size = global_shape[sort_axis]; for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); } // in case of distributed data (1D) we need to switch to sample sort if (is_index_space && DIM == 1) { + // not implemented yet + assert(false); + // create (starting) sample of (at most) domain.get_volume() equidistant values // also enrich values with additional indexes rank & local position in order to handle // duplicate values diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index ff70e26d9..f48f9a079 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include "cunumeric/cuda_help.h" @@ -36,7 +38,8 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, - const size_t sort_dim_size, + const uint32_t sort_axis, + Legion::DomainPoint global_shape, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -47,9 +50,60 @@ struct SortImplBody { << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; #endif + const size_t sort_dim_size = global_shape[sort_axis]; thrust::device_ptr dev_ptr(inptr); - for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - thrust::stable_sort(dev_ptr + start_idx, dev_ptr + start_idx + sort_dim_size); + + // same approach as cupy implemntation --> combine multiple individual sorts into single + // kernel with data tuples - (id_sub-sort, actual_data) + if (DIM == 1) { + thrust::stable_sort(dev_ptr, dev_ptr + volume); + } else { + // in this case we know we are sorting for the *last* index + const uint64_t max_elements_per_kernel = + 1 << 22; // TODO check amount of available GPU memory from config + const uint64_t number_sorts_per_kernel = + std::max(1ul, std::min(volume, max_elements_per_kernel) / sort_dim_size); + const uint64_t number_sorts = volume / sort_dim_size; + + // std::cout << "Number of sorts per kernel: " << number_sorts_per_kernel << std::endl; + + if (number_sorts_per_kernel >= + 32) // key-tuple sort has quite some overhead -- 
only utilize if beneficial + { + // allocate memory for keys (iterating +=1 for each individual sort dimension) + // ensure keys have minimal bit-length (needs values up to number_sorts_per_kernel-1)! + // TODO!!!! + auto keys_array = create_buffer(number_sorts_per_kernel * sort_dim_size, + Legion::Memory::Kind::GPU_FB_MEM); + thrust::device_ptr dev_key_ptr(keys_array.ptr(0)); + + for (uint64_t sort_part = 0; sort_part < number_sorts; + sort_part += number_sorts_per_kernel) { + // compute size of batch (might be smaller for the last call) + const uint64_t num_elements = + std::min(number_sorts - sort_part, max_elements_per_kernel) * sort_dim_size; + const uint64_t offset = sort_part * sort_dim_size; + + // reinit keys + thrust::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_elements), + thrust::make_constant_iterator(sort_dim_size), + dev_key_ptr, + thrust::divides()); + + // sort + auto combined = + thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_ptr + offset)); + thrust::stable_sort( + combined, combined + num_elements, thrust::less>()); + } + } else { + // number_sorts_per_kernel == 1 ----> we don't need keys + for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { + const uint64_t offset = sort_part * sort_dim_size; + thrust::stable_sort(dev_ptr + offset, dev_ptr + offset + sort_dim_size); + } + } } // in case of distributed data we need to switch to sample sort diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index 6afba9230..cc90ff21d 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -22,7 +22,8 @@ namespace cunumeric { struct SortArgs { Array& output; - size_t sort_dim_size; + uint32_t sort_axis; + Legion::DomainPoint global_shape; bool is_index_space; Legion::DomainPoint index_point; Legion::Domain domain; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index c7265f447..659e5c138 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -33,7 +33,8 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, - const size_t sort_dim_size, + const uint32_t sort_axis, + Legion::DomainPoint global_shape, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -44,7 +45,7 @@ struct SortImplBody { << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; #endif - + const size_t sort_dim_size = global_shape[sort_axis]; if (volume / sort_dim_size > omp_get_max_threads() / 2) // TODO fine tune { #pragma omp do schedule(dynamic) diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 2b4886471..81c020912 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -33,6 +33,10 @@ struct SortImpl { auto rect = args.output.shape(); + // we shall not return on empty rectangle in case of distributed data + // as the process might still participate in the parallel sort + if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; + Pitches pitches; size_t volume = pitches.flatten(rect); @@ -45,32 +49,83 @@ struct SortImpl { * 3. if we have more than one participants: * a) 1D-case: we need to perform parallel sort (e.g. 
via sampling) * b) ND-case: rect needs to be the full domain in that last dimension + * + * FIXME: understand legion-dim != ndarray-dim case + * + * */ #ifdef DEBUG_CUNUMERIC - std::cout << "DIM=" << DIM << ", rect=" << rect << ", sort_dim_size=" << args.sort_dim_size - << std::endl; + std::cout << "DIM=" << DIM << ", rect=" << rect << ", shape=" << args.global_shape + << ", axis=" << args.sort_axis + << ", sort_dim_size=" << args.global_shape[args.sort_axis] << std::endl; + + assert((DIM == 1 || (rect.hi[args.sort_axis] - rect.lo[args.sort_axis] + 1 == + args.global_shape[args.sort_axis])) && + "multi-dimensional array should not be distributed in (sort) dimension"); +#endif - assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.sort_dim_size)) && - "multi-dimensional array should not be distributed in last (sort) dimension"); +#ifndef LEGION_BOUNDS_CHECKS + bool dense = inout.accessor.is_dense_row_major(rect); +#else + bool dense = false; #endif - SortImplBody()(inout.ptr(rect), - pitches, - rect, - volume, - args.sort_dim_size, - args.is_index_space, - args.index_point, - args.domain); + if (dense) { + SortImplBody()(inout.ptr(rect), + pitches, + rect, + volume, + args.sort_axis, + args.global_shape, + args.is_index_space, + args.index_point, + args.domain); + } else { + // NOTE: we might want to place this loop logic in the different KIND-implementations in + // norder to re-use buffers + + assert(!args.is_index_space || DIM > 1); + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < args.global_shape[i]) { break; } + } + + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + // RUN based on current start point + SortImplBody()(&(inout[start_point]), + pitches, + rect, + contiguous_elements, + args.sort_axis, + args.global_shape, + args.is_index_space, + args.index_point, + args.domain); + elements_processed += contiguous_elements; + } + } } }; template static void sort_template(TaskContext& context) { + DomainPoint global_shape; + { + auto shape_span = context.scalars()[1].values(); + global_shape.dim = shape_span.size(); + for (int32_t dim = 0; dim < global_shape.dim; ++dim) { global_shape[dim] = shape_span[dim]; } + } + SortArgs args{context.outputs()[0], - context.scalars()[0].value(), + context.scalars()[0].value(), + global_shape, context.task_->is_index_space, context.task_->index_point, context.task_->index_domain}; diff --git a/tests/sort.py b/tests/sort.py index bdc4c4b93..85f17ed3b 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -22,10 +22,11 @@ def test_sort_axis(a_np, a_num, axis): assert num.allclose(a_np, a_num) print("Sorting axis " + str(axis) + ":") sort_np = np.sort(a_np, axis) - sort_num = num.sort(a_num, axis, kind="merge") - # print(sort_np) - # print(sort_num) - assert num.allclose(sort_np, sort_num) + sort_num = num.sort(a_num, axis) + if not num.allclose(sort_np, sort_num): + print(sort_np) + print(sort_num) + assert False def test_1D(): @@ -64,6 +65,7 @@ def test_2D(): test_sort_axis(A_np, A_num, 1) test_sort_axis(A_np, A_num, 0) + test_sort_axis(A_np, A_num, axis=None) return @@ -84,22 +86,22 @@ def test_3D(): test_sort_axis(A_np, A_num, 2) test_sort_axis(A_np, A_num, 1) test_sort_axis(A_np, A_num, 0) + test_sort_axis(A_np, A_num, axis=None) return def test_custom(): - a = np.arange(2 * 4).reshape(2, 4) - 
a_transpose = np.transpose(a) + a = np.arange(4 * 4 * 5 * 2 * 3 * 2 * 2 * 2 * 4).reshape( + 4, 4, 5, 2, 3, 2, 2, 2, 4 + ) - a_transposed_num = num.array([[0, 4], [1, 5], [2, 6], [3, 7]]) a_num = num.array(a) - a_num_transposed = a_num.swapaxes(0, 1) test_sort_axis(a, a_num, 1) - test_sort_axis(a_transpose, a_transposed_num, 1) - test_sort_axis(a_transpose, a_num_transposed, 1) - test_sort_axis(a_transpose, a_num_transposed, 0) + test_sort_axis(a, a_num, 2) + test_sort_axis(a, a_num, 7) + test_sort_axis(a, a_num, 4) return From 5e982c2e53e9b09a5fd430d8e71b4ef70174cec9 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 17 Feb 2022 12:07:08 -0800 Subject: [PATCH 17/49] refactoring and documentation --- cunumeric/array.py | 25 +---- cunumeric/deferred.py | 78 +++++++++++----- cunumeric/eager.py | 8 +- cunumeric/module.py | 126 +++++++++++++++++++++++-- src/cunumeric/mapper.cc | 8 ++ src/cunumeric/sort/sort.cc | 135 +++++++-------------------- src/cunumeric/sort/sort.cu | 72 ++++++++------ src/cunumeric/sort/sort.h | 4 +- src/cunumeric/sort/sort_omp.cc | 63 ++++++++----- src/cunumeric/sort/sort_template.inl | 79 ++++++---------- tests/sort.py | 76 +++++++++++++-- 11 files changed, 405 insertions(+), 269 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index f128b96e8..6c5a598f4 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1481,29 +1481,8 @@ def setflags(self, write=None, align=None, uic=None): ) def sort(self, axis=-1, kind="stable", order=None): - if kind != "stable": - runtime.warn( - "cuNumeric uses a different (stable) algorithm than " - + str(kind) - + " for sorting", - category=RuntimeWarning, - stacklevel=2, - ) - if order is not None: - raise NotImplementedError( - "cuNumeric does not support sorting with 'order' as " - "ndarray only supports numeric values" - ) - if axis is not None and (axis >= self.ndim or axis < -self.ndim): - raise ValueError("invalid axis") - - if self._thunk.scalar: - # nothing to do - return - else: - # this is the default -- sorting of N-D array - self._thunk.sort(axis=axis) - return + self._thunk.sort(rhs=self._thunk, axis=axis, kind=kind, order=order) + return def squeeze(self, axis=None): if axis is not None: diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index aaa8f6c26..311753138 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1518,53 +1518,89 @@ def cholesky(self, src, no_tril=False): if not no_tril: self.trilu(self, 0, True) - def sort(self, axis=-1, kind="stable", order=None): - if axis is None and self.ndim > 1: - flattened = self.reshape((self.size,), order="C") + @auto_convert([1]) + def sort(self, rhs, axis=-1, kind="stable", order=None): + + if kind != "stable": + self.runtime.warn( + "cuNumeric uses a different (stable) algorithm than " + + str(kind) + + " for sorting", + category=RuntimeWarning, + stacklevel=2, + ) + if order is not None: + raise NotImplementedError( + "cuNumeric does not support sorting with 'order' as " + "ndarray only supports numeric values" + ) + if axis is not None and (axis >= rhs.ndim or axis < -rhs.ndim): + raise ValueError("invalid axis") + + if axis is None and rhs.ndim > 1: + flattened = rhs.reshape((rhs.size,), order="C") flattened_copy = self.runtime.create_empty_thunk( - flattened.shape, dtype=self.dtype, inputs=[self, flattened] + flattened.shape, dtype=rhs.dtype, inputs=[rhs, flattened] ) flattened_copy.copy(flattened, deep=True) - # run sort on last axis -- return 1D solution - flattened_copy.sort() + # run sort flattened -- return 1D solution 
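
The axis=None branch below follows NumPy's flatten-then-sort semantics, which can be stated as a small reference check:

    import numpy as np

    a = np.array([[9, 1], [4, 3]])
    # np.sort(a, axis=None) returns a sorted, flattened copy
    assert np.array_equal(np.sort(a, axis=None), np.sort(a.reshape(-1)))
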
+ flattened_copy.sort(flattened_copy) self.base = flattened_copy.base self.numpy_array = None + else: if axis is None: sort_axis = 0 elif axis < 0: - sort_axis = self.ndim + axis + sort_axis = rhs.ndim + axis else: sort_axis = axis - if sort_axis is not self.ndim - 1: - assert sort_axis < self.ndim - 1 and sort_axis >= 0 + if sort_axis is not rhs.ndim - 1: + assert sort_axis < rhs.ndim - 1 and sort_axis >= 0 # swap axes - swapped = self.swapaxes(sort_axis, self.ndim - 1) + swapped = rhs.swapaxes(sort_axis, rhs.ndim - 1) swapped_copy = self.runtime.create_empty_thunk( - swapped.shape, dtype=self.dtype, inputs=[self, swapped] + swapped.shape, dtype=rhs.dtype, inputs=[rhs, swapped] ) swapped_copy.copy(swapped, deep=True) # run sort on last axis - swapped_copy.sort(self.ndim - 1) + swapped_copy.sort(swapped_copy) - self.base = swapped_copy.swapaxes( - self.ndim - 1, sort_axis - ).base + self.base = swapped_copy.swapaxes(rhs.ndim - 1, sort_axis).base self.numpy_array = None + else: # run actual sort task - self.runtime.legate_runtime.issue_execution_fence(block=True) + needs_communication = self.runtime.num_gpus > 1 or ( + self.runtime.num_gpus == 0 and self.runtime.num_procs > 1 + ) + + if needs_communication: + self.runtime.legate_runtime.issue_execution_fence( + block=True + ) + task = self.context.create_task(CuNumericOpCode.SORT) task.add_output(self.base) - task.add_input(self.base) + task.add_input(rhs.base) + task.add_alignment(self.base, rhs.base) if self.ndim > 1: - task.add_broadcast(self.base, self.ndim - 1) - task.add_scalar_arg(self.ndim - 1, ty.int32) - task.add_scalar_arg(self.base.shape, (ty.int32,)) + task.add_broadcast(rhs.base, rhs.ndim - 1) + elif needs_communication: + # print("Distributed 1D sort --> broadcast") + task.add_broadcast(rhs.base) + + task.add_scalar_arg(False, bool) # descending flag + task.add_scalar_arg(False, bool) # return indices flag + task.add_scalar_arg(rhs.base.shape, (ty.int32,)) task.execute() - self.runtime.legate_runtime.issue_execution_fence(block=True) + + if needs_communication: + self.runtime.legate_runtime.issue_execution_fence( + block=True + ) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index 0ebdd8959..520287a6f 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -502,12 +502,12 @@ def nonzero(self): result += (EagerArray(self.runtime, array),) return result - def sort(self, axis=-1, kind="stable", order=None): - self.check_eager_args(axis, kind, order) + def sort(self, rhs, axis=-1, kind="stable", order=None): + self.check_eager_args(rhs, axis, kind, order) if self.deferred is not None: - self.deferred.sort(axis, kind, order) + self.deferred.sort(rhs, axis, kind, order) else: - self.array.sort(axis, kind, order) + self.array = np.sort(rhs.array, axis, kind, order) def random_uniform(self): if self.deferred is not None: diff --git a/cunumeric/module.py b/cunumeric/module.py index ffb5eaff5..17728b9b0 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5576,27 +5576,135 @@ def sign(a, out=None, where=True, dtype=None, **kwargs): @add_boilerplate("a") def argsort(a, axis=-1, kind="stable", order=None): - return a.argsort(axis=axis, kind=kind, order=order) + """ + Returns the indices that would sort an array. -def lexsort(a, axis=-1): - raise NotImplementedError("Not yet implemented") + Parameters + ---------- + a : array_like + Input array. + axis : int or None, optional + Axis to sort. By default, the index -1 (the last axis) is used. If + None, the flattened array is used. 
+ kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Currently only 'stable' sort is supported + order : str or list of str, optional + Currently not supported + + Returns + ------- + index_array : ndarray of ints + Array of indices that sort a along the specified axis. It has the + same shape as `a.shape` or is flattened in case of `axis` is None. + + See Also + -------- + numpy.argsort + + Availability + -------- + GPU, CPU + """ + return a.argsort(axis=axis, kind=kind, order=order) def msort(a): - return sort(a) + """ + + Returns a sorted copy of an array sorted along the first axis. + + Parameters + ---------- + a : array_like + Input array. + + Returns + ------- + out : ndarray + Sorted array with same dtype and shape as `a`. + + See Also + -------- + numpy.msort + + Availability + -------- + GPU, CPU + """ + return sort(a, axis=0) @add_boilerplate("a") def sort(a, axis=-1, kind="stable", order=None): - out = a.copy() - out_array = ndarray.convert_to_cunumeric_ndarray(out) - out_array._thunk.sort(axis=axis, kind=kind, order=order) - return out_array + """ + Returns a sorted copy of an array. + Parameters + ---------- + a : array_like + Input array. + axis : int or None, optional + Axis to sort. By default, the index -1 (the last axis) is used. If + None, the flattened array is used. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Currently only 'stable' sort is supported + order : str or list of str, optional + Currently not supported + + Returns + ------- + out : ndarray + Sorted array with same dtype and shape as `a`. In case `axis` is + None the result is flattened. + + See Also + -------- + numpy.sort + + Availability + -------- + GPU, CPU + """ + result = ndarray(a.shape, a.dtype) + result._thunk.sort(rhs=a._thunk, axis=axis, kind=kind, order=order) + return result + + +@add_boilerplate("a") def sort_complex(a): - return sort(a) + """ + + Returns a sorted copy of an array sorted along the last axis. Sorts the + real part first, the imaginary part second. + + Parameters + ---------- + a : array_like + Input array. + + Returns + ------- + out : ndarray, complex + Sorted array with same shape as `a`. 
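
A small behavioral sketch of the promotion documented above (the concrete complex dtype is an implementation detail of this version):

    import cunumeric as num

    out = num.sort_complex(num.array([3, 1, 2]))
    # real input comes back complex-valued; the values sort to [1, 2, 3]
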
+ + See Also + -------- + numpy.sort_complex + + Availability + -------- + GPU, CPU + """ + + # force complex result + if np.issubdtype(a.dtype, np.complexfloating): + out = a + else: + out = a.astype(np.complex64, copy=True) + + return sort(out) # Searching diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index 0be299982..3452bc78d 100644 --- a/src/cunumeric/mapper.cc +++ b/src/cunumeric/mapper.cc @@ -115,6 +115,14 @@ std::vector CuNumericMapper::store_mappings( } return std::move(mappings); } + case CUNUMERIC_SORT: { + std::vector mappings; + auto& inputs = task.inputs(); + mappings.push_back(StoreMapping::default_mapping(inputs[0], options.front())); + mappings.back().policy.ordering.c_order(); + mappings.back().policy.exact = true; + return std::move(mappings); + } default: { return {}; } diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 33bd9e83b..44a6ba3e6 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -27,118 +27,51 @@ template struct SortImplBody { using VAL = legate_type_of; - void operator()(VAL* inptr, + void std_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) + { + std::copy(inptr, inptr + volume, outptr); + for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + std::stable_sort(outptr + start_idx, outptr + start_idx + sort_dim_size); + } + } + + void operator()(AccessorRO input, + AccessorWO output, const Pitches& pitches, const Rect& rect, + const bool dense, const size_t volume, - const uint32_t sort_axis, - Legion::DomainPoint global_shape, - bool is_index_space, - Legion::DomainPoint index_point, - Legion::Domain domain) + const Legion::DomainPoint global_shape, + const bool is_index_space, + const Legion::DomainPoint index_point, + const Legion::Domain domain) { #ifdef DEBUG_CUNUMERIC std::cout << "CPU(" << index_point[0] << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; - - if (volume <= 30) { - std::cout << "inptr = [ "; - for (size_t i = 0; i < volume; ++i) { std::cout << (i > 0 ? 
", " : " ") << inptr[i]; } - std::cout << "]" << std::endl; - } + << ", domain/volume = " << domain << "/" << domain.get_volume() + << ", dense = " << dense << std::endl; #endif - const size_t sort_dim_size = global_shape[sort_axis]; - for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); - } - - // in case of distributed data (1D) we need to switch to sample sort - if (is_index_space && DIM == 1) { - // not implemented yet - assert(false); - - // create (starting) sample of (at most) domain.get_volume() equidistant values - // also enrich values with additional indexes rank & local position in order to handle - // duplicate values - size_t num_local_samples = std::min(domain.get_volume(), volume); - size_t local_rank = index_point[0]; - auto local_samples = std::make_unique[]>(num_local_samples); - for (int i = 0; i < num_local_samples; ++i) { - const size_t index = (i + 1) * volume / num_local_samples - 1; - local_samples[i].value = inptr[index]; - local_samples[i].rank = local_rank; - local_samples[i].local_id = index; + const size_t sort_dim_size = global_shape[DIM - 1]; + assert(!is_index_space || DIM > 1); // not implemented for now + if (dense) { + std_sort(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + } else { + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < global_shape[i]) { break; } } - // std::cout << "local samples: size = " << num_local_samples << std::endl; - // std::cout << "first = (" << local_samples[0].value << "," << local_samples[0].rank << ","<< - // local_samples[0].local_id << ")" << std::endl; std::cout << "last = (" << - // local_samples[num_local_samples-1].value << "," << local_samples[num_local_samples-1].rank - // << ","<< local_samples[num_local_samples-1].local_id << ")" << std::endl; - - // all2all those samples - // TODO broadcast package size - // TODO allocate targets - // TODO broadcast samples - size_t num_global_samples = 15; - std::unique_ptr[]> global_samples(new SampleEntry[num_global_samples]); - - // sort all samples (utilize 2nd and 3rd sort criteria as well) - std::stable_sort(&(global_samples[0]), - &(global_samples[0]) + num_global_samples, - SampleEntryComparator()); - - // define splitters - auto splitters = std::make_unique[]>(domain.get_volume() - 1); - for (int i = 0; i < domain.get_volume() - 1; ++i) { - const size_t index = (i + 1) * num_global_samples / domain.get_volume() - 1; - splitters[i] = global_samples[index]; + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + std_sort( + input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); + elements_processed += contiguous_elements; } - - do { - // compute local package sizes for every process based on splitters - std::unique_ptr local_partition_size(new size_t[domain.get_volume()]); - { - size_t range_start = 0; - size_t local_position = 0; - for (int p_index = 0; p_index < domain.get_volume(); ++p_index) { - // move as long current value is lesser or equaöl to current splitter - while (local_position < volume && - (inptr[local_position] < splitters[p_index].value || - (inptr[local_position] == splitters[p_index].value && - (local_rank < splitters[p_index].rank || - (local_rank == 
splitters[p_index].rank && - local_position <= splitters[p_index].local_id))))) { - local_position++; - } - - local_partition_size[p_index++] = local_position - range_start; - range_start = local_position; - } - } - - // communicate local package-sizes all2all - // TODO - - // evaluate distribution result?? - // TODO - - // if (good enough) break; - // TODO - break; - // else iterate/improve splitters - // TODO - - } while (true); - - // all2all accepted distribution - // package sizes should already be known - // all2all communication - // TODO - - // final merge sort of received packages - // TODO } } }; diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index f48f9a079..fd864ca78 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -34,29 +34,15 @@ template struct SortImplBody { using VAL = legate_type_of; - void operator()(VAL* inptr, - const Pitches& pitches, - const Rect& rect, - const size_t volume, - const uint32_t sort_axis, - Legion::DomainPoint global_shape, - bool is_index_space, - Legion::DomainPoint index_point, - Legion::Domain domain) + void thrust_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) { -#ifdef DEBUG_CUNUMERIC - std::cout << "GPU(" << index_point[0] << "): local size = " << volume - << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; -#endif - - const size_t sort_dim_size = global_shape[sort_axis]; - thrust::device_ptr dev_ptr(inptr); - + thrust::device_ptr dev_input_ptr(inptr); + thrust::device_ptr dev_output_ptr(outptr); + thrust::copy(dev_input_ptr, dev_input_ptr + volume, dev_output_ptr); // same approach as cupy implemntation --> combine multiple individual sorts into single // kernel with data tuples - (id_sub-sort, actual_data) if (DIM == 1) { - thrust::stable_sort(dev_ptr, dev_ptr + volume); + thrust::stable_sort(dev_output_ptr, dev_output_ptr + volume); } else { // in this case we know we are sorting for the *last* index const uint64_t max_elements_per_kernel = @@ -93,23 +79,57 @@ struct SortImplBody { // sort auto combined = - thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_ptr + offset)); + thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_output_ptr + offset)); thrust::stable_sort( combined, combined + num_elements, thrust::less>()); } } else { - // number_sorts_per_kernel == 1 ----> we don't need keys + // number_sorts_per_kernel too small ----> we sort one after another for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { const uint64_t offset = sort_part * sort_dim_size; - thrust::stable_sort(dev_ptr + offset, dev_ptr + offset + sort_dim_size); + thrust::stable_sort(dev_output_ptr + offset, dev_output_ptr + offset + sort_dim_size); } } } + } + + void operator()(AccessorRO input, + AccessorWO output, + const Pitches& pitches, + const Rect& rect, + const bool dense, + const size_t volume, + const Legion::DomainPoint global_shape, + const bool is_index_space, + const Legion::DomainPoint index_point, + const Legion::Domain domain) + { +#ifdef DEBUG_CUNUMERIC + std::cout << "GPU(" << index_point[0] << "): local size = " << volume + << ", dist. 
= " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() + << ", dense = " << dense << std::endl; +#endif + const size_t sort_dim_size = global_shape[DIM - 1]; + assert(!is_index_space || DIM > 1); // not implemented for now + if (dense) { + thrust_sort(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + } else { + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < global_shape[i]) { break; } + } - // in case of distributed data we need to switch to sample sort - if (is_index_space && DIM == 1) { - // not implemented yet - assert(false); + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + thrust_sort( + input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); + elements_processed += contiguous_elements; + } } } }; diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index cc90ff21d..837027086 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -21,8 +21,10 @@ namespace cunumeric { struct SortArgs { + const Array& input; Array& output; - uint32_t sort_axis; + bool descending; + bool argsort; Legion::DomainPoint global_shape; bool is_index_space; Legion::DomainPoint index_point; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 659e5c138..c5b3ccd54 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -29,38 +29,59 @@ template struct SortImplBody { using VAL = legate_type_of; - void operator()(VAL* inptr, - const Pitches& pitches, - const Rect& rect, - const size_t volume, - const uint32_t sort_axis, - Legion::DomainPoint global_shape, - bool is_index_space, - Legion::DomainPoint index_point, - Legion::Domain domain) + void std_sort_omp(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) { -#ifdef DEBUG_CUNUMERIC - std::cout << "OMP(" << index_point[0] << ":" << omp_get_max_threads() << ":" << omp_get_nested() - << "): local size = " << volume << ", dist. = " << is_index_space - << ", index_point = " << index_point << ", domain/volume = " << domain << "/" - << domain.get_volume() << std::endl; -#endif - const size_t sort_dim_size = global_shape[sort_axis]; + std::copy(inptr, inptr + volume, outptr); if (volume / sort_dim_size > omp_get_max_threads() / 2) // TODO fine tune { #pragma omp do schedule(dynamic) for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + std::stable_sort(outptr + start_idx, outptr + start_idx + sort_dim_size); } } else { for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - __gnu_parallel::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + __gnu_parallel::stable_sort(outptr + start_idx, outptr + start_idx + sort_dim_size); } } + } + + void operator()(AccessorRO input, + AccessorWO output, + const Pitches& pitches, + const Rect& rect, + const bool dense, + const size_t volume, + const Legion::DomainPoint global_shape, + const bool is_index_space, + const Legion::DomainPoint index_point, + const Legion::Domain domain) + { +#ifdef DEBUG_CUNUMERIC + std::cout << "CPU(" << index_point[0] << "): local size = " << volume + << ", dist. 
= " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() + << ", dense = " << dense << std::endl; +#endif + const size_t sort_dim_size = global_shape[DIM - 1]; + assert(!is_index_space || DIM > 1); // not implemented for now + if (dense) { + std_sort_omp(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + } else { + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < global_shape[i]) { break; } + } - if (is_index_space && DIM == 1) { - // not implemented yet - assert(false); + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + std_sort_omp( + input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); + elements_processed += contiguous_elements; + } } } }; diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 81c020912..5488330a5 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -31,7 +31,7 @@ struct SortImpl { { using VAL = legate_type_of; - auto rect = args.output.shape(); + auto rect = args.input.shape(); // we shall not return on empty rectangle in case of distributed data // as the process might still participate in the parallel sort @@ -40,76 +40,47 @@ struct SortImpl { Pitches pitches; size_t volume = pitches.flatten(rect); - auto inout = args.output.read_write_accessor(rect); + auto input = args.input.read_accessor(rect); + auto output = args.output.write_accessor(rect); /* * Assumptions: * 1. Sort is always requested for the 'last' dimension within rect * 2. We have product_of_all_other_dimensions independent sort ranges * 3. if we have more than one participants: - * a) 1D-case: we need to perform parallel sort (e.g. via sampling) + * a) 1D-case: we need to perform parallel sort (e.g. 
via sampling) -- not implemented yet * b) ND-case: rect needs to be the full domain in that last dimension * - * FIXME: understand legion-dim != ndarray-dim case - * - * */ #ifdef DEBUG_CUNUMERIC std::cout << "DIM=" << DIM << ", rect=" << rect << ", shape=" << args.global_shape - << ", axis=" << args.sort_axis - << ", sort_dim_size=" << args.global_shape[args.sort_axis] << std::endl; + << ", descending=" << args.descending << ", argsort=" << args.argsort + << ", sort_dim_size=" << args.global_shape[DIM - 1] << std::endl; - assert((DIM == 1 || (rect.hi[args.sort_axis] - rect.lo[args.sort_axis] + 1 == - args.global_shape[args.sort_axis])) && + assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.global_shape[DIM - 1])) && "multi-dimensional array should not be distributed in (sort) dimension"); #endif #ifndef LEGION_BOUNDS_CHECKS - bool dense = inout.accessor.is_dense_row_major(rect); + bool dense = + input.accessor.is_dense_row_major(rect) && output.accessor.is_dense_row_major(rect); #else bool dense = false; #endif - if (dense) { - SortImplBody()(inout.ptr(rect), - pitches, - rect, - volume, - args.sort_axis, - args.global_shape, - args.is_index_space, - args.index_point, - args.domain); - } else { - // NOTE: we might want to place this loop logic in the different KIND-implementations in - // norder to re-use buffers - - assert(!args.is_index_space || DIM > 1); - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < args.global_shape[i]) { break; } - } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - // RUN based on current start point - SortImplBody()(&(inout[start_point]), - pitches, - rect, - contiguous_elements, - args.sort_axis, - args.global_shape, - args.is_index_space, - args.index_point, - args.domain); - elements_processed += contiguous_elements; - } - } + assert(dense || !args.is_index_space || DIM > 1); + + SortImplBody()(input, + output, + pitches, + rect, + dense, + volume, + args.global_shape, + args.is_index_space, + args.index_point, + args.domain); } }; @@ -118,13 +89,15 @@ static void sort_template(TaskContext& context) { DomainPoint global_shape; { - auto shape_span = context.scalars()[1].values(); + auto shape_span = context.scalars()[2].values(); global_shape.dim = shape_span.size(); for (int32_t dim = 0; dim < global_shape.dim; ++dim) { global_shape[dim] = shape_span[dim]; } } - SortArgs args{context.outputs()[0], - context.scalars()[0].value(), + SortArgs args{context.inputs()[0], + context.outputs()[0], + context.scalars()[0].value(), + context.scalars()[1].value(), global_shape, context.task_->is_index_space, context.task_->index_point, diff --git a/tests/sort.py b/tests/sort.py index 85f17ed3b..ef78d0447 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -70,11 +70,8 @@ def test_2D(): return -def test_3D(): +def test_3D(x_dim, y_dim, z_dim): np.random.seed(42) - x_dim = 5 - y_dim = 3 - z_dim = 7 A_np = np.array( np.random.randint(10, size=x_dim * y_dim * z_dim), dtype=np.int32 ).reshape(x_dim, y_dim, z_dim) @@ -91,7 +88,33 @@ def test_3D(): return +def test_3D_complex(x_dim, y_dim, z_dim): + np.random.seed(42) + A_np = np.array( + np.random.random(size=x_dim * y_dim * z_dim), dtype=np.complex64 + ).reshape(x_dim, y_dim, z_dim) + + A_num = num.array(A_np) + print("Sorting 3d tensor:\n") + print(A_np) + + 
test_sort_axis(A_np, A_num, 2) + test_sort_axis(A_np, A_num, 1) + test_sort_axis(A_np, A_num, 0) + test_sort_axis(A_np, A_num, axis=None) + + return + + def test_custom(): + # 4D still works, >=5D always falls back to numpy + a = np.arange(4 * 2 * 2 * 4).reshape(4, 2, 2, 4) + a_num = num.array(a) + + test_sort_axis(a, a_num, 1) + test_sort_axis(a, a_num, 2) + test_sort_axis(a, a_num, a.ndim - 1) + a = np.arange(4 * 4 * 5 * 2 * 3 * 2 * 2 * 2 * 4).reshape( 4, 4, 5, 2, 3, 2, 2, 2, 4 ) @@ -106,15 +129,48 @@ def test_custom(): return +def test_other_api(): + a = np.arange(4 * 2 * 3).reshape(4, 2, 3) + a_num = num.array(a) + + # msort + assert num.allclose(np.msort(a), num.msort(a_num)) + + # sort_complex + assert num.allclose(np.sort_complex(a), num.sort_complex(a_num)) + + # reverse order sort + # TODO + + # in-place sort + copy_a = a.copy() + copy_a_num = a_num.copy() + copy_a.sort() + copy_a_num.sort() + assert num.allclose(copy_a, copy_a_num) + + # reverse order sort (in place) + # TODO + + # argsort + # TODO + + return + + def test(): - print("\n\n ----------- Custom test ---------------\n") - test_custom() - print("\n\n ----------- 2D test ---------------\n") - test_2D() - print("\n\n ----------- 3D test ---------------\n") - test_3D() print("\n\n ----------- 1D test ---------------\n") test_1D() + print("\n\n ----------- 2D test ---------------\n") + test_2D() + print("\n\n ----------- 3D test (int32) -------\n") + test_3D(51, 23, 17) + print("\n\n ----------- 3D test (complex) -----\n") + test_3D_complex(27, 30, 45) + print("\n\n ----------- 4D/5D test-------------\n") + test_custom() + print("\n\n ----------- API test --------------\n") + test_other_api() if __name__ == "__main__": From c9e4407901b6e81e86fa6a2f5770dfc964f040de Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 18 Feb 2022 14:51:26 -0800 Subject: [PATCH 18/49] added argsort support and test coverage --- cunumeric/array.py | 6 + cunumeric/deferred.py | 20 +- cunumeric/eager.py | 9 +- cunumeric/module.py | 7 +- src/cunumeric/sort/sort.cc | 72 ++++- src/cunumeric/sort/sort.cu | 404 +++++++++++++++++++++++---- src/cunumeric/sort/sort.h | 5 +- src/cunumeric/sort/sort_omp.cc | 97 ++++++- src/cunumeric/sort/sort_template.inl | 88 ++++-- tests/sort.py | 106 ++++++- 10 files changed, 696 insertions(+), 118 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 6c5a598f4..924bd9edd 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1484,6 +1484,12 @@ def sort(self, axis=-1, kind="stable", order=None): self._thunk.sort(rhs=self._thunk, axis=axis, kind=kind, order=order) return + def argsort(self, axis=-1, kind="stable", order=None): + self._thunk.sort( + rhs=self._thunk, argsort=True, axis=axis, kind=kind, order=order + ) + return + def squeeze(self, axis=None): if axis is not None: if isinstance(axis, int): diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 311753138..5d43f8bfa 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1519,7 +1519,7 @@ def cholesky(self, src, no_tril=False): self.trilu(self, 0, True) @auto_convert([1]) - def sort(self, rhs, axis=-1, kind="stable", order=None): + def sort(self, rhs, argsort=False, axis=-1, kind="stable", order=None): if kind != "stable": self.runtime.warn( @@ -1545,8 +1545,11 @@ def sort(self, rhs, axis=-1, kind="stable", order=None): flattened_copy.copy(flattened, deep=True) # run sort flattened -- return 1D solution - flattened_copy.sort(flattened_copy) - self.base = flattened_copy.base + sort_result = 
self.runtime.create_empty_thunk( + flattened_copy.shape, dtype=self.dtype, inputs=[flattened_copy] + ) + sort_result.sort(rhs=flattened_copy, argsort=argsort) + self.base = sort_result.base self.numpy_array = None else: @@ -1569,9 +1572,12 @@ def sort(self, rhs, axis=-1, kind="stable", order=None): swapped_copy.copy(swapped, deep=True) # run sort on last axis - swapped_copy.sort(swapped_copy) + sort_result = self.runtime.create_empty_thunk( + swapped_copy.shape, dtype=self.dtype, inputs=[swapped_copy] + ) + sort_result.sort(rhs=swapped_copy, argsort=argsort) - self.base = swapped_copy.swapaxes(rhs.ndim - 1, sort_axis).base + self.base = sort_result.swapaxes(rhs.ndim - 1, sort_axis).base self.numpy_array = None else: @@ -1586,6 +1592,7 @@ def sort(self, rhs, axis=-1, kind="stable", order=None): ) task = self.context.create_task(CuNumericOpCode.SORT) + task.add_output(self.base) task.add_input(rhs.base) task.add_alignment(self.base, rhs.base) @@ -1595,8 +1602,7 @@ def sort(self, rhs, axis=-1, kind="stable", order=None): # print("Distributed 1D sort --> broadcast") task.add_broadcast(rhs.base) - task.add_scalar_arg(False, bool) # descending flag - task.add_scalar_arg(False, bool) # return indices flag + task.add_scalar_arg(argsort, bool) # return indices flag task.add_scalar_arg(rhs.base.shape, (ty.int32,)) task.execute() diff --git a/cunumeric/eager.py b/cunumeric/eager.py index 520287a6f..f5bc10a96 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -502,12 +502,15 @@ def nonzero(self): result += (EagerArray(self.runtime, array),) return result - def sort(self, rhs, axis=-1, kind="stable", order=None): + def sort(self, rhs, argsort=False, axis=-1, kind="stable", order=None): self.check_eager_args(rhs, axis, kind, order) if self.deferred is not None: - self.deferred.sort(rhs, axis, kind, order) + self.deferred.sort(rhs, argsort, axis, kind, order) else: - self.array = np.sort(rhs.array, axis, kind, order) + if argsort: + self.array = np.argsort(rhs.array, axis, kind, order) + else: + self.array = np.sort(rhs.array, axis, kind, order) def random_uniform(self): if self.deferred is not None: diff --git a/cunumeric/module.py b/cunumeric/module.py index 596d61b11..d99b4e9c2 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5606,7 +5606,12 @@ def argsort(a, axis=-1, kind="stable", order=None): -------- GPU, CPU """ - return a.argsort(axis=axis, kind=kind, order=order) + + result = ndarray(a.shape, np.int32) + result._thunk.sort( + rhs=a._thunk, argsort=True, axis=axis, kind=kind, order=order + ) + return result def msort(a): diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 44a6ba3e6..8b209827a 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -17,14 +17,16 @@ #include "cunumeric/sort/sort.h" #include "cunumeric/sort/sort_template.inl" +#include + namespace cunumeric { using namespace Legion; using namespace legate; -// general routine +// general routine SORT template -struct SortImplBody { +struct SortImplBody { using VAL = legate_type_of; void std_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) @@ -41,13 +43,14 @@ struct SortImplBody { const Rect& rect, const bool dense, const size_t volume, + const bool argsort, const Legion::DomainPoint global_shape, const bool is_index_space, const Legion::DomainPoint index_point, const Legion::Domain domain) { #ifdef DEBUG_CUNUMERIC - std::cout << "CPU(" << index_point[0] << "): local size = " << volume + std::cout << "CPU(" << getRank(domain, 
index_point) << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << ", dense = " << dense << std::endl; @@ -76,6 +79,69 @@ struct SortImplBody { } }; +// general routine ARGSORT +template +struct SortImplBody { + using VAL = legate_type_of; + + void std_argsort(const VAL* inptr, + int32_t* outptr, + const size_t volume, + const size_t sort_dim_size) + { + for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + int32_t* segmentKeys = outptr + start_idx; + const VAL* segmentValues = inptr + start_idx; + std::iota(segmentKeys, segmentKeys + sort_dim_size, 0); + std::stable_sort( + segmentKeys, segmentKeys + sort_dim_size, [segmentValues](int32_t i1, int32_t i2) { + return segmentValues[i1] < segmentValues[i2]; + }); + } + } + + void operator()(AccessorRO input, + AccessorWO output, + const Pitches& pitches, + const Rect& rect, + const bool dense, + const size_t volume, + const bool argsort, + const Legion::DomainPoint global_shape, + const bool is_index_space, + const Legion::DomainPoint index_point, + const Legion::Domain domain) + { +#ifdef DEBUG_CUNUMERIC + std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume + << ", dist. = " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() + << ", dense = " << dense << std::endl; +#endif + const size_t sort_dim_size = global_shape[DIM - 1]; + assert(!is_index_space || DIM > 1); // not implemented for now + if (dense) { + std_argsort(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + } else { + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < global_shape[i]) { break; } + } + + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + std_argsort( + input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); + elements_processed += contiguous_elements; + } + } + } +}; + /*static*/ void SortTask::cpu_variant(TaskContext& context) { sort_template(context); diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index fd864ca78..a170a5069 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include "cunumeric/cuda_help.h" @@ -30,90 +32,376 @@ namespace cunumeric { using namespace Legion; +struct multiply : public thrust::unary_function { + const int constant; + + multiply(int _constant) : constant(_constant) {} + + __host__ __device__ int operator()(int& input) const { return input * constant; } +}; + +template +void cub_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) +{ + if (volume == sort_dim_size) { + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortKeys(NULL, temp_storage_bytes, inptr, outptr, volume); + + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + + cub::DeviceRadixSort::SortKeys(temp_storage.ptr(0), temp_storage_bytes, inptr, outptr, volume); + } else { + auto off_start_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); + auto off_end_it = + 
thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size));
+
+    size_t temp_storage_bytes = 0;
+    cub::DeviceSegmentedRadixSort::SortKeys(NULL,
+                                            temp_storage_bytes,
+                                            inptr,
+                                            outptr,
+                                            volume,
+                                            volume / sort_dim_size,
+                                            off_start_it,
+                                            off_end_it);
+    auto temp_storage =
+      create_buffer<unsigned char>(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM);
+
+    cub::DeviceSegmentedRadixSort::SortKeys(temp_storage.ptr(0),
+                                            temp_storage_bytes,
+                                            inptr,
+                                            outptr,
+                                            volume,
+                                            volume / sort_dim_size,
+                                            off_start_it,
+                                            off_end_it);
+  }
+}
+
+template <typename VAL>
+void thrust_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size)
+{
+  thrust::device_ptr<const VAL> dev_input_ptr(inptr);
+  thrust::device_ptr<VAL> dev_output_ptr(outptr);
+  thrust::copy(dev_input_ptr, dev_input_ptr + volume, dev_output_ptr);
+  // same approach as the cupy implementation --> combine multiple individual sorts into a
+  // single kernel with data tuples - (id_sub-sort, actual_data)
+  if (volume == sort_dim_size) {
+    thrust::stable_sort(dev_output_ptr, dev_output_ptr + volume);
+  } else {
+    // in this case we know we are sorting for the *last* index
+    const uint64_t max_elements_per_kernel =
+      1 << 22;  // TODO check amount of available GPU memory from config
+    const uint64_t number_sorts_per_kernel =
+      std::max(1ul, std::min(volume, max_elements_per_kernel) / sort_dim_size);
+    const uint64_t number_sorts = volume / sort_dim_size;
+
+    // std::cout << "Number of sorts per kernel: " << number_sorts_per_kernel << std::endl;
+
+    if (number_sorts_per_kernel >=
+        32)  // key-tuple sort has quite some overhead -- only utilize if beneficial
+    {
+      // allocate memory for keys (iterating +=1 for each individual sort dimension)
+      // ensure keys have minimal bit-length (needs values up to number_sorts_per_kernel-1)!
+      // TODO!!!!
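+      //
+      // how the key-tuple trick works (illustrative note, values made up): every
+      // element is tagged with the id of its sub-sort segment, and ONE stable sort
+      // runs over the (segment_id, value) tuples. The tuple comparison is
+      // lexicographic: segment id first, value second, so a single sort orders
+      // every segment at once without elements crossing segment boundaries.
+      // E.g. for sort_dim_size = 3:
+      //   keys   : 0 0 0 1 1 1
+      //   values : 7 1 4 9 2 5
+      // sorts to (0,1)(0,4)(0,7)(1,2)(1,5)(1,9), i.e. 1 4 7 | 2 5 9.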
+ auto keys_array = create_buffer(number_sorts_per_kernel * sort_dim_size, + Legion::Memory::Kind::GPU_FB_MEM); + thrust::device_ptr dev_key_ptr(keys_array.ptr(0)); + + for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part += number_sorts_per_kernel) { + // compute size of batch (might be smaller for the last call) + const uint64_t num_elements = + std::min(number_sorts - sort_part, max_elements_per_kernel) * sort_dim_size; + const uint64_t offset = sort_part * sort_dim_size; + + // reinit keys + thrust::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_elements), + thrust::make_constant_iterator(sort_dim_size), + dev_key_ptr, + thrust::divides()); + + // sort + auto combined = + thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_output_ptr + offset)); + thrust::stable_sort( + combined, combined + num_elements, thrust::less>()); + } + } else { + // number_sorts_per_kernel too small ----> we sort one after another + for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { + const uint64_t offset = sort_part * sort_dim_size; + thrust::stable_sort(dev_output_ptr + offset, dev_output_ptr + offset + sort_dim_size); + } + } + } +} + +template +void cub_argsort(const VAL* inptr, int32_t* outptr, const size_t volume, const size_t sort_dim_size) +{ + auto keys_out = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + thrust::device_ptr dev_key_out_ptr(keys_out.ptr(0)); + + auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + thrust::device_ptr dev_idx_in_ptr(idx_in.ptr(0)); + thrust::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + dev_idx_in_ptr, + thrust::modulus()); + + if (volume == sort_dim_size) { + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs( + NULL, temp_storage_bytes, inptr, keys_out.ptr(0), idx_in.ptr(0), outptr, volume); + + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + + cub::DeviceRadixSort::SortPairs(temp_storage.ptr(0), + temp_storage_bytes, + inptr, + keys_out.ptr(0), + idx_in.ptr(0), + outptr, + volume); + } else { + auto off_start_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); + auto off_end_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); + + size_t temp_storage_bytes = 0; + cub::DeviceSegmentedRadixSort::SortPairs(NULL, + temp_storage_bytes, + inptr, + keys_out.ptr(0), + idx_in.ptr(0), + outptr, + volume, + volume / sort_dim_size, + off_start_it, + off_end_it); + + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + + cub::DeviceSegmentedRadixSort::SortPairs(temp_storage.ptr(0), + temp_storage_bytes, + inptr, + keys_out.ptr(0), + idx_in.ptr(0), + outptr, + volume, + volume / sort_dim_size, + off_start_it, + off_end_it); + } +} + +template +void thrust_argsort(const VAL* inptr, + int32_t* outptr, + const size_t volume, + const size_t sort_dim_size) +{ + thrust::device_ptr dev_input_ptr(inptr); + + auto keys_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + thrust::device_ptr dev_keys_copy_ptr(keys_copy.ptr(0)); + thrust::copy(dev_input_ptr, dev_input_ptr + volume, dev_keys_copy_ptr); + + thrust::device_ptr dev_output_ptr(outptr); + thrust::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + 
dev_output_ptr, + thrust::modulus()); + + // same approach as cupy implemntation --> combine multiple individual sorts into single + // kernel with data tuples - (id_sub-sort, actual_data) + if (volume == sort_dim_size) { + thrust::stable_sort_by_key(dev_keys_copy_ptr, dev_keys_copy_ptr + volume, dev_output_ptr); + } else { + // in this case we know we are sorting for the *last* index + const uint64_t max_elements_per_kernel = + 1 << 22; // TODO check amount of available GPU memory from config + const uint64_t number_sorts_per_kernel = + std::max(1ul, std::min(volume, max_elements_per_kernel) / sort_dim_size); + const uint64_t number_sorts = volume / sort_dim_size; + + // std::cout << "Number of sorts per kernel: " << number_sorts_per_kernel << std::endl; + + if (number_sorts_per_kernel >= + 32) // key-tuple sort has quite some overhead -- only utilize if beneficial + { + // allocate memory for keys (iterating +=1 for each individual sort dimension) + // ensure keys have minimal bit-length (needs values up to number_sorts_per_kernel-1)! + // TODO!!!! + auto keys_array = create_buffer(number_sorts_per_kernel * sort_dim_size, + Legion::Memory::Kind::GPU_FB_MEM); + thrust::device_ptr dev_key_ptr(keys_array.ptr(0)); + + for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part += number_sorts_per_kernel) { + // compute size of batch (might be smaller for the last call) + const uint64_t num_elements = + std::min(number_sorts - sort_part, max_elements_per_kernel) * sort_dim_size; + const uint64_t offset = sort_part * sort_dim_size; + + // reinit keys + thrust::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_elements), + thrust::make_constant_iterator(sort_dim_size), + dev_key_ptr, + thrust::divides()); + + // sort + auto combined = + thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_keys_copy_ptr + offset)); + thrust::stable_sort_by_key(combined, + combined + num_elements, + dev_output_ptr + offset, + thrust::less>()); + } + } else { + // number_sorts_per_kernel too small ----> we sort one after another + for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { + const uint64_t offset = sort_part * sort_dim_size; + thrust::stable_sort_by_key(dev_keys_copy_ptr + offset, + dev_keys_copy_ptr + offset + sort_dim_size, + dev_output_ptr + offset); + } + } + } +} + +template +struct support_cub : std::true_type { +}; +template <> +struct support_cub : std::false_type { +}; +template <> +struct support_cub : std::false_type { +}; + +template ::value>* = nullptr> +void sort_stable(const legate_type_of* inptr, + legate_type_of* outptr, + const size_t volume, + const size_t sort_dim_size) +{ + using VAL = legate_type_of; + cub_sort(inptr, outptr, volume, sort_dim_size); +} + +template ::value>* = nullptr> +void sort_stable(const legate_type_of* inptr, + legate_type_of* outptr, + const size_t volume, + const size_t sort_dim_size) +{ + using VAL = legate_type_of; + thrust_sort(inptr, outptr, volume, sort_dim_size); +} + +template ::value>* = nullptr> +void argsort_stable(const legate_type_of* inptr, + int32_t* outptr, + const size_t volume, + const size_t sort_dim_size) +{ + using VAL = legate_type_of; + cub_argsort(inptr, outptr, volume, sort_dim_size); +} + +template ::value>* = nullptr> +void argsort_stable(const legate_type_of* inptr, + int32_t* outptr, + const size_t volume, + const size_t sort_dim_size) +{ + using VAL = legate_type_of; + thrust_argsort(inptr, outptr, volume, sort_dim_size); +} + template -struct SortImplBody { +struct 
SortImplBody { using VAL = legate_type_of; - void thrust_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) + void operator()(AccessorRO input, + AccessorWO output, + const Pitches& pitches, + const Rect& rect, + const bool dense, + const size_t volume, + const bool argsort, + const Legion::DomainPoint global_shape, + const bool is_index_space, + const Legion::DomainPoint index_point, + const Legion::Domain domain) { - thrust::device_ptr dev_input_ptr(inptr); - thrust::device_ptr dev_output_ptr(outptr); - thrust::copy(dev_input_ptr, dev_input_ptr + volume, dev_output_ptr); - // same approach as cupy implemntation --> combine multiple individual sorts into single - // kernel with data tuples - (id_sub-sort, actual_data) - if (DIM == 1) { - thrust::stable_sort(dev_output_ptr, dev_output_ptr + volume); +#ifdef DEBUG_CUNUMERIC + std::cout << "GPU(" << getRank(domain, index_point) << "): local size = " << volume + << ", dist. = " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() + << ", dense = " << dense << std::endl; +#endif + assert(!argsort); + const size_t sort_dim_size = global_shape[DIM - 1]; + assert(!is_index_space || DIM > 1); // not implemented for now + if (dense) { + sort_stable(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); } else { - // in this case we know we are sorting for the *last* index - const uint64_t max_elements_per_kernel = - 1 << 22; // TODO check amount of available GPU memory from config - const uint64_t number_sorts_per_kernel = - std::max(1ul, std::min(volume, max_elements_per_kernel) / sort_dim_size); - const uint64_t number_sorts = volume / sort_dim_size; - - // std::cout << "Number of sorts per kernel: " << number_sorts_per_kernel << std::endl; - - if (number_sorts_per_kernel >= - 32) // key-tuple sort has quite some overhead -- only utilize if beneficial - { - // allocate memory for keys (iterating +=1 for each individual sort dimension) - // ensure keys have minimal bit-length (needs values up to number_sorts_per_kernel-1)! - // TODO!!!! 
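The ARGSORT variant that follows seeds the output with per-segment indices (a modulus transform over a counting iterator) and then stable-sorts those indices by their values. A minimal numpy sketch of the same computation (array contents made up; numpy stands in for the thrust/cub calls):

    import numpy as np

    sort_dim_size = 3
    values = np.array([7, 1, 4, 9, 2, 5])         # two segments of length 3

    # initial per-segment indices, as the modulus transform produces them
    idx = np.arange(values.size) % sort_dim_size  # [0, 1, 2, 0, 1, 2]

    # stable-sorting each segment's indices by its values gives the argsort result
    result = np.argsort(values.reshape(-1, sort_dim_size), axis=1, kind="stable").ravel()
    # result -> [1, 2, 0, 1, 2, 0]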
- auto keys_array = create_buffer(number_sorts_per_kernel * sort_dim_size, - Legion::Memory::Kind::GPU_FB_MEM); - thrust::device_ptr dev_key_ptr(keys_array.ptr(0)); - - for (uint64_t sort_part = 0; sort_part < number_sorts; - sort_part += number_sorts_per_kernel) { - // compute size of batch (might be smaller for the last call) - const uint64_t num_elements = - std::min(number_sorts - sort_part, max_elements_per_kernel) * sort_dim_size; - const uint64_t offset = sort_part * sort_dim_size; - - // reinit keys - thrust::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_elements), - thrust::make_constant_iterator(sort_dim_size), - dev_key_ptr, - thrust::divides()); - - // sort - auto combined = - thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_output_ptr + offset)); - thrust::stable_sort( - combined, combined + num_elements, thrust::less>()); - } - } else { - // number_sorts_per_kernel too small ----> we sort one after another - for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { - const uint64_t offset = sort_part * sort_dim_size; - thrust::stable_sort(dev_output_ptr + offset, dev_output_ptr + offset + sort_dim_size); - } + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < global_shape[i]) { break; } + } + + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + sort_stable( + input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); + elements_processed += contiguous_elements; } } } +}; + +template +struct SortImplBody { + using VAL = legate_type_of; void operator()(AccessorRO input, - AccessorWO output, + AccessorWO output, const Pitches& pitches, const Rect& rect, const bool dense, const size_t volume, + const bool argsort, const Legion::DomainPoint global_shape, const bool is_index_space, const Legion::DomainPoint index_point, const Legion::Domain domain) { #ifdef DEBUG_CUNUMERIC - std::cout << "GPU(" << index_point[0] << "): local size = " << volume + std::cout << "GPU(" << getRank(domain, index_point) << "): local size = " << volume << ", dist. 
= " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << ", dense = " << dense << std::endl; #endif + assert(argsort); const size_t sort_dim_size = global_shape[DIM - 1]; assert(!is_index_space || DIM > 1); // not implemented for now if (dense) { - thrust_sort(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + argsort_stable(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); } else { // compute contiguous memory block int contiguous_elements = 1; @@ -126,7 +414,7 @@ struct SortImplBody { uint64_t elements_processed = 0; while (elements_processed < volume) { Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - thrust_sort( + argsort_stable( input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); elements_processed += contiguous_elements; } diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index 837027086..b915df838 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -23,12 +23,11 @@ namespace cunumeric { struct SortArgs { const Array& input; Array& output; - bool descending; bool argsort; Legion::DomainPoint global_shape; bool is_index_space; - Legion::DomainPoint index_point; - Legion::Domain domain; + Legion::DomainPoint task_index; + Legion::Domain launch_domain; }; template diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index c5b3ccd54..1fc560617 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -26,7 +26,7 @@ using namespace Legion; using namespace legate; template -struct SortImplBody { +struct SortImplBody { using VAL = legate_type_of; void std_sort_omp(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) @@ -51,13 +51,14 @@ struct SortImplBody { const Rect& rect, const bool dense, const size_t volume, + const bool argsort, const Legion::DomainPoint global_shape, const bool is_index_space, const Legion::DomainPoint index_point, const Legion::Domain domain) { #ifdef DEBUG_CUNUMERIC - std::cout << "CPU(" << index_point[0] << "): local size = " << volume + std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume << ", dist. 
= " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << ", dense = " << dense << std::endl; @@ -86,6 +87,98 @@ struct SortImplBody { } }; +template +struct SortImplBody { + using VAL = legate_type_of; + + void std_argsort(const VAL* inptr, + int32_t* outptr, + const size_t volume, + const size_t sort_dim_size) + { + for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + int32_t* segmentKeys = outptr + start_idx; + const VAL* segmentValues = inptr + start_idx; + std::iota(outptr + start_idx, outptr + start_idx + sort_dim_size, 0); + std::stable_sort( + segmentKeys, segmentKeys + sort_dim_size, [segmentValues](int32_t i1, int32_t i2) { + return segmentValues[i1] < segmentValues[i2]; + }); + } + } + + void std_argsort_omp(const VAL* inptr, + int32_t* outptr, + const size_t volume, + const size_t sort_dim_size) + { + if (volume / sort_dim_size > omp_get_max_threads() / 2) // TODO fine tune + { +#pragma omp do schedule(dynamic) + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + int32_t* segmentKeys = outptr + start_idx; + const VAL* segmentValues = inptr + start_idx; + std::iota(segmentKeys, segmentKeys + sort_dim_size, 0); + std::stable_sort( + segmentKeys, segmentKeys + sort_dim_size, [segmentValues](int32_t i1, int32_t i2) { + return segmentValues[i1] < segmentValues[i2]; + }); + } + } else { + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + int32_t* segmentKeys = outptr + start_idx; + const VAL* segmentValues = inptr + start_idx; + std::iota(segmentKeys, segmentKeys + sort_dim_size, 0); + __gnu_parallel::stable_sort( + segmentKeys, segmentKeys + sort_dim_size, [segmentValues](int32_t i1, int32_t i2) { + return segmentValues[i1] < segmentValues[i2]; + }); + } + } + } + + void operator()(AccessorRO input, + AccessorWO output, + const Pitches& pitches, + const Rect& rect, + const bool dense, + const size_t volume, + const bool argsort, + const Legion::DomainPoint global_shape, + const bool is_index_space, + const Legion::DomainPoint index_point, + const Legion::Domain domain) + { +#ifdef DEBUG_CUNUMERIC + std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume + << ", dist. 
= " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() + << ", dense = " << dense << std::endl; +#endif + const size_t sort_dim_size = global_shape[DIM - 1]; + assert(!is_index_space || DIM > 1); // not implemented for now + if (dense) { + std_argsort_omp(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + } else { + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < global_shape[i]) { break; } + } + + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + std_argsort_omp( + input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); + elements_processed += contiguous_elements; + } + } + } +}; + /*static*/ void SortTask::omp_variant(TaskContext& context) { sort_template(context); diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 5488330a5..c00b4a7e8 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -21,9 +21,19 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template +template struct SortImplBody; +static int getRank(Domain domain, DomainPoint index_point) +{ + int domain_index = 0; + for (int i = 0; i < domain.get_dim(); ++i) { + if (i > 0) domain_index *= domain.hi()[i] - domain.lo()[i] + 1; + domain_index += index_point[i]; + } + return domain_index; +} + template struct SortImpl { template @@ -40,9 +50,6 @@ struct SortImpl { Pitches pitches; size_t volume = pitches.flatten(rect); - auto input = args.input.read_accessor(rect); - auto output = args.output.write_accessor(rect); - /* * Assumptions: * 1. 
Sort is always requested for the 'last' dimension within rect @@ -55,32 +62,60 @@ struct SortImpl { #ifdef DEBUG_CUNUMERIC std::cout << "DIM=" << DIM << ", rect=" << rect << ", shape=" << args.global_shape - << ", descending=" << args.descending << ", argsort=" << args.argsort - << ", sort_dim_size=" << args.global_shape[DIM - 1] << std::endl; + << ", argsort=" << args.argsort << ", sort_dim_size=" << args.global_shape[DIM - 1] + << std::endl; assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.global_shape[DIM - 1])) && "multi-dimensional array should not be distributed in (sort) dimension"); #endif + auto input = args.input.read_accessor(rect); + + if (args.argsort) { + auto output = args.output.write_accessor(rect); + #ifndef LEGION_BOUNDS_CHECKS - bool dense = - input.accessor.is_dense_row_major(rect) && output.accessor.is_dense_row_major(rect); + bool dense = + input.accessor.is_dense_row_major(rect) && output.accessor.is_dense_row_major(rect); #else - bool dense = false; + bool dense = false; #endif + assert(dense || !args.is_index_space || DIM > 1); + + SortImplBody()(input, + output, + pitches, + rect, + dense, + volume, + args.argsort, + args.global_shape, + args.is_index_space, + args.task_index, + args.launch_domain); + + } else { + auto output = args.output.write_accessor(rect); - assert(dense || !args.is_index_space || DIM > 1); - - SortImplBody()(input, - output, - pitches, - rect, - dense, - volume, - args.global_shape, - args.is_index_space, - args.index_point, - args.domain); +#ifndef LEGION_BOUNDS_CHECKS + bool dense = + input.accessor.is_dense_row_major(rect) && output.accessor.is_dense_row_major(rect); +#else + bool dense = false; +#endif + assert(dense || !args.is_index_space || DIM > 1); + SortImplBody()(input, + output, + pitches, + rect, + dense, + volume, + args.argsort, + args.global_shape, + args.is_index_space, + args.task_index, + args.launch_domain); + } } }; @@ -89,7 +124,7 @@ static void sort_template(TaskContext& context) { DomainPoint global_shape; { - auto shape_span = context.scalars()[2].values(); + auto shape_span = context.scalars()[1].values(); global_shape.dim = shape_span.size(); for (int32_t dim = 0; dim < global_shape.dim; ++dim) { global_shape[dim] = shape_span[dim]; } } @@ -97,12 +132,11 @@ static void sort_template(TaskContext& context) SortArgs args{context.inputs()[0], context.outputs()[0], context.scalars()[0].value(), - context.scalars()[1].value(), global_shape, - context.task_->is_index_space, - context.task_->index_point, - context.task_->index_domain}; - double_dispatch(args.output.dim(), args.output.code(), SortImpl{}, args); + !context.is_single_task(), + context.get_task_index(), + context.get_launch_domain()}; + double_dispatch(args.input.dim(), args.input.code(), SortImpl{}, args); } } // namespace cunumeric diff --git a/tests/sort.py b/tests/sort.py index ef78d0447..24385b06a 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -18,15 +18,24 @@ import cunumeric as num +def compare_assert(a_np, a_num): + if not num.allclose(a_np, a_num): + print("numpy:") + print(a_np) + print("cuNumeric:") + print(a_num) + assert False + + def test_sort_axis(a_np, a_num, axis): - assert num.allclose(a_np, a_num) + compare_assert(a_np, a_num) print("Sorting axis " + str(axis) + ":") sort_np = np.sort(a_np, axis) sort_num = num.sort(a_num, axis) - if not num.allclose(sort_np, sort_num): - print(sort_np) - print(sort_num) - assert False + compare_assert(sort_np, sort_num) + argsort_np = np.sort(a_np, axis) + argsort_num = 
num.sort(a_num, axis) + compare_assert(argsort_np, argsort_num) def test_1D(): @@ -42,11 +51,11 @@ def test_1D(): # pdb.set_trace() sortA_num = num.sort(A_num) print("Result cunumeric: " + str(sortA_num)) - assert num.allclose(sortA_np, sortA_num) + compare_assert(sortA_np, sortA_num) A_num.sort() print("Result (inplace): " + str(A_num)) - assert num.allclose(sortA_np, A_num) + compare_assert(sortA_np, A_num) return @@ -129,15 +138,25 @@ def test_custom(): return -def test_other_api(): - a = np.arange(4 * 2 * 3).reshape(4, 2, 3) +def test_api(a=None): + if a is None: + a = np.arange(4 * 2 * 3).reshape(4, 2, 3) a_num = num.array(a) + # sort axes + for i in range(a.ndim): + compare_assert(np.sort(a, axis=i, kind="stable"), num.sort(a_num, i)) + + # flatten + compare_assert( + np.sort(a, axis=None, kind="stable"), num.sort(a_num, axis=None) + ) + # msort - assert num.allclose(np.msort(a), num.msort(a_num)) + compare_assert(np.msort(a), num.msort(a_num)) # sort_complex - assert num.allclose(np.sort_complex(a), num.sort_complex(a_num)) + compare_assert(np.sort_complex(a), num.sort_complex(a_num)) # reverse order sort # TODO @@ -147,13 +166,70 @@ def test_other_api(): copy_a_num = a_num.copy() copy_a.sort() copy_a_num.sort() - assert num.allclose(copy_a, copy_a_num) + compare_assert(copy_a, copy_a_num) # reverse order sort (in place) # TODO # argsort - # TODO + for i in range(a.ndim): + compare_assert(a, a_num) + compare_assert( + np.argsort(a, axis=i, kind="stable"), num.argsort(a_num, axis=i) + ) + + # flatten + compare_assert( + np.argsort(a, axis=None, kind="stable"), num.argsort(a_num, axis=None) + ) + + return + + +def generate_random(shape, datatype): + print("Generate random for " + str(datatype)) + a_np = None + volume = 1 + for i in shape: + volume *= i + + if np.issubdtype(datatype, np.integer): + a_np = np.array( + np.random.randint( + np.iinfo(datatype).min, np.iinfo(datatype).max, size=volume + ), + dtype=datatype, + ) + elif np.issubdtype(datatype, np.floating): + a_np = np.array(np.random.random(size=volume), dtype=datatype) + elif np.issubdtype(datatype, np.complexfloating): + a_np = np.array( + np.random.random(size=volume) + np.random.random(size=volume) * 1j, + dtype=datatype, + ) + else: + print("UNKNOWN type " + str(datatype)) + assert False + return a_np + + +def test_dtypes(): + np.random.seed(42) + test_api(generate_random((2, 5, 7), np.uint8)) + test_api(generate_random((8, 5), np.uint16)) + test_api(generate_random((22, 5, 7), np.uint32)) + + test_api(generate_random((2, 5, 7), np.int8)) + test_api(generate_random((8, 5), np.int16)) + test_api(generate_random((22, 5, 7), np.int32)) + test_api(generate_random((2, 5, 7), np.int64)) + + test_api(generate_random((8, 5), np.float32)) + test_api(generate_random((8, 5), np.float64)) + test_api(generate_random((22, 5, 7), np.double)) + + test_api(generate_random((2, 5, 7), np.complex64)) + test_api(generate_random((2, 5, 7), np.complex128)) return @@ -170,7 +246,9 @@ def test(): print("\n\n ----------- 4D/5D test-------------\n") test_custom() print("\n\n ----------- API test --------------\n") - test_other_api() + test_api() + print("\n\n ----------- dtype test ------------\n") + test_dtypes() if __name__ == "__main__": From fd0d3f8fb7ec368d7bd07520bba41ecf4c51ab36 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 18 Feb 2022 15:12:45 -0800 Subject: [PATCH 19/49] adjusted docstring --- cunumeric/module.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/cunumeric/module.py 
b/cunumeric/module.py index 88e78164e..7b938f078 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5610,13 +5610,18 @@ def argsort(a, axis=-1, kind="stable", order=None): Array of indices that sort a along the specified axis. It has the same shape as `a.shape` or is flattened in case of `axis` is None. + Notes + ----- + The current implementation has only limited support for distributed data. + Distributed 1-D or flattened data will be broadcasted. + See Also -------- numpy.argsort Availability -------- - GPU, CPU + Single GPU, Single CPU """ result = ndarray(a.shape, np.int32) @@ -5641,13 +5646,18 @@ def msort(a): out : ndarray Sorted array with same dtype and shape as `a`. + Notes + ----- + The current implementation has only limited support for distributed data. + Distributed 1-D data will be broadcasted. + See Also -------- numpy.msort Availability -------- - GPU, CPU + Single GPU, Single CPU """ return sort(a, axis=0) @@ -5676,13 +5686,18 @@ def sort(a, axis=-1, kind="stable", order=None): Sorted array with same dtype and shape as `a`. In case `axis` is None the result is flattened. + Notes + ----- + The current implementation has only limited support for distributed data. + Distributed 1-D or flattened data will be broadcasted. + See Also -------- numpy.sort Availability -------- - GPU, CPU + Single GPU, Single CPU """ result = ndarray(a.shape, a.dtype) result._thunk.sort(rhs=a._thunk, axis=axis, kind=kind, order=order) @@ -5706,13 +5721,18 @@ def sort_complex(a): out : ndarray, complex Sorted array with same shape as `a`. + Notes + ----- + The current implementation has only limited support for distributed data. + Distributed 1-D data will be broadcasted. + See Also -------- numpy.sort_complex Availability -------- - GPU, CPU + Single GPU, Single CPU """ # force complex result From 6c385dd2dfdaba135357768da02f7a446341acac Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 18 Feb 2022 15:38:44 -0800 Subject: [PATCH 20/49] extract messy code from deferred --- cunumeric/deferred.py | 75 +------------------------ cunumeric/sorting/__init__.py | 16 ++++++ cunumeric/sorting/sorting.py | 102 ++++++++++++++++++++++++++++++++++ 3 files changed, 120 insertions(+), 73 deletions(-) create mode 100644 cunumeric/sorting/__init__.py create mode 100644 cunumeric/sorting/sorting.py diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 5d43f8bfa..15fdbd6ac 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -32,6 +32,7 @@ UnaryRedCode, ) from .linalg.cholesky import cholesky +from .sorting.sorting import sorting from .thunk import NumPyThunk from .utils import get_arg_value_dtype @@ -1537,76 +1538,4 @@ def sort(self, rhs, argsort=False, axis=-1, kind="stable", order=None): if axis is not None and (axis >= rhs.ndim or axis < -rhs.ndim): raise ValueError("invalid axis") - if axis is None and rhs.ndim > 1: - flattened = rhs.reshape((rhs.size,), order="C") - flattened_copy = self.runtime.create_empty_thunk( - flattened.shape, dtype=rhs.dtype, inputs=[rhs, flattened] - ) - flattened_copy.copy(flattened, deep=True) - - # run sort flattened -- return 1D solution - sort_result = self.runtime.create_empty_thunk( - flattened_copy.shape, dtype=self.dtype, inputs=[flattened_copy] - ) - sort_result.sort(rhs=flattened_copy, argsort=argsort) - self.base = sort_result.base - self.numpy_array = None - - else: - if axis is None: - sort_axis = 0 - elif axis < 0: - sort_axis = rhs.ndim + axis - else: - sort_axis = axis - - if sort_axis is not rhs.ndim - 1: - assert sort_axis < 
rhs.ndim - 1 and sort_axis >= 0 - - # swap axes - swapped = rhs.swapaxes(sort_axis, rhs.ndim - 1) - - swapped_copy = self.runtime.create_empty_thunk( - swapped.shape, dtype=rhs.dtype, inputs=[rhs, swapped] - ) - swapped_copy.copy(swapped, deep=True) - - # run sort on last axis - sort_result = self.runtime.create_empty_thunk( - swapped_copy.shape, dtype=self.dtype, inputs=[swapped_copy] - ) - sort_result.sort(rhs=swapped_copy, argsort=argsort) - - self.base = sort_result.swapaxes(rhs.ndim - 1, sort_axis).base - self.numpy_array = None - - else: - # run actual sort task - needs_communication = self.runtime.num_gpus > 1 or ( - self.runtime.num_gpus == 0 and self.runtime.num_procs > 1 - ) - - if needs_communication: - self.runtime.legate_runtime.issue_execution_fence( - block=True - ) - - task = self.context.create_task(CuNumericOpCode.SORT) - - task.add_output(self.base) - task.add_input(rhs.base) - task.add_alignment(self.base, rhs.base) - if self.ndim > 1: - task.add_broadcast(rhs.base, rhs.ndim - 1) - elif needs_communication: - # print("Distributed 1D sort --> broadcast") - task.add_broadcast(rhs.base) - - task.add_scalar_arg(argsort, bool) # return indices flag - task.add_scalar_arg(rhs.base.shape, (ty.int32,)) - task.execute() - - if needs_communication: - self.runtime.legate_runtime.issue_execution_fence( - block=True - ) + sorting(self, rhs, argsort, axis) diff --git a/cunumeric/sorting/__init__.py b/cunumeric/sorting/__init__.py new file mode 100644 index 000000000..8988b3353 --- /dev/null +++ b/cunumeric/sorting/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys as _sys diff --git a/cunumeric/sorting/sorting.py b/cunumeric/sorting/sorting.py new file mode 100644 index 000000000..246b3abe3 --- /dev/null +++ b/cunumeric/sorting/sorting.py @@ -0,0 +1,102 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
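The helpers that follow reduce every case to a sort along the last axis: an axis=None request sorts a flattened copy, and any other axis is swapped to the end, sorted, and swapped back. A minimal NumPy sketch of that reduction (illustrative only, not part of the patch):

    import numpy as np

    def sort_along_axis(a, axis):
        # normalize a negative axis, as sorting() below does
        axis = axis % a.ndim
        if axis == a.ndim - 1:
            return np.sort(a, axis=-1)
        # swap the target axis to the end, sort, swap back -- the same
        # strategy sort_swapped() applies to deferred thunks
        swapped = a.swapaxes(axis, a.ndim - 1)
        return np.sort(swapped, axis=-1).swapaxes(a.ndim - 1, axis)

    a = np.random.rand(3, 4, 5)
    assert np.array_equal(sort_along_axis(a, 0), np.sort(a, axis=0))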
+# + + +from cunumeric.config import CuNumericOpCode + +from legate.core import types as ty + + +def sort_flattened(output, input, argsort): + flattened = input.reshape((input.size,), order="C") + flattened_copy = output.runtime.create_empty_thunk( + flattened.shape, dtype=input.dtype, inputs=[input, flattened] + ) + flattened_copy.copy(flattened, deep=True) + + # run sort flattened -- return 1D solution + sort_result = output.runtime.create_empty_thunk( + flattened_copy.shape, dtype=output.dtype, inputs=[flattened_copy] + ) + sorting(sort_result, flattened_copy, argsort) + output.base = sort_result.base + output.numpy_array = None + + +def sort_swapped(output, input, argsort, sort_axis): + assert sort_axis < input.ndim - 1 and sort_axis >= 0 + + # swap axes + swapped = input.swapaxes(sort_axis, input.ndim - 1) + + swapped_copy = output.runtime.create_empty_thunk( + swapped.shape, dtype=input.dtype, inputs=[input, swapped] + ) + swapped_copy.copy(swapped, deep=True) + + # run sort on last axis + sort_result = output.runtime.create_empty_thunk( + swapped_copy.shape, dtype=output.dtype, inputs=[swapped_copy] + ) + sorting(sort_result, swapped_copy, argsort) + + output.base = sort_result.swapaxes(input.ndim - 1, sort_axis).base + output.numpy_array = None + + +def sort_task(output, input, argsort): + needs_communication = output.runtime.num_gpus > 1 or ( + output.runtime.num_gpus == 0 and output.runtime.num_procs > 1 + ) + + if needs_communication: + output.runtime.legate_runtime.issue_execution_fence(block=True) + + task = output.context.create_task(CuNumericOpCode.SORT) + + task.add_output(output.base) + task.add_input(input.base) + task.add_alignment(output.base, input.base) + if output.ndim > 1: + task.add_broadcast(input.base, input.ndim - 1) + elif needs_communication: + # print("Distributed 1D sort --> broadcast") + task.add_broadcast(input.base) + + task.add_scalar_arg(argsort, bool) # return indices flag + task.add_scalar_arg(input.base.shape, (ty.int32,)) + task.execute() + + if needs_communication: + output.runtime.legate_runtime.issue_execution_fence(block=True) + + +def sorting(output, input, argsort, axis=-1): + if axis is None and input.ndim > 1: + sort_flattened(output, input, argsort) + else: + if axis is None: + sort_axis = 0 + elif axis < 0: + sort_axis = input.ndim + axis + else: + sort_axis = axis + + if sort_axis is not input.ndim - 1: + sort_swapped(output, input, argsort, sort_axis) + + else: + # run actual sort task + sort_task(output, input, argsort) From 49c3f3bd99abab23a0b9b9643bc3ecba8efc8d4e Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 25 Feb 2022 09:08:53 -0800 Subject: [PATCH 21/49] refactor sort c-code, simplify, reduce duplicated code --- src/cunumeric/sort/sort.cc | 148 +++---- src/cunumeric/sort/sort.cu | 583 ++++++++++++--------------- src/cunumeric/sort/sort_omp.cc | 184 ++++----- src/cunumeric/sort/sort_template.inl | 59 +-- tests/sort.py | 3 + 5 files changed, 417 insertions(+), 560 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 8b209827a..30bcd4592 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -17,6 +17,9 @@ #include "cunumeric/sort/sort.h" #include "cunumeric/sort/sort_template.inl" +#include +#include + #include namespace cunumeric { @@ -24,24 +27,37 @@ namespace cunumeric { using namespace Legion; using namespace legate; -// general routine SORT template -struct SortImplBody { +struct SortImplBody { using VAL = legate_type_of; - void std_sort(const VAL* inptr, VAL* 
outptr, const size_t volume, const size_t sort_dim_size) + // sorts inptr in-place, if argptr not nullptr it returns sort indices + void thrust_local_sort_inplace(VAL* inptr, + int32_t* argptr, + const size_t volume, + const size_t sort_dim_size) { - std::copy(inptr, inptr + volume, outptr); - for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - std::stable_sort(outptr + start_idx, outptr + start_idx + sort_dim_size); + if (argptr == nullptr) { + // sort (in place) + for (size_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + thrust::stable_sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size); + } + } else { + // argsort + for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + int32_t* segmentValues = argptr + start_idx; + VAL* segmentKeys = inptr + start_idx; + std::iota(segmentValues, segmentValues + sort_dim_size, 0); // init + thrust::stable_sort_by_key( + thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + } } } - void operator()(AccessorRO input, - AccessorWO output, + void operator()(const Array& input_array, + Array& output_array, const Pitches& pitches, const Rect& rect, - const bool dense, const size_t volume, const bool argsort, const Legion::DomainPoint global_shape, @@ -49,94 +65,64 @@ struct SortImplBody { const Legion::DomainPoint index_point, const Legion::Domain domain) { + AccessorRO input = input_array.read_accessor(rect); + + bool dense = input.accessor.is_dense_row_major(rect); + #ifdef DEBUG_CUNUMERIC std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << std::endl; + << ", dense = " << dense << ", argsort. = " << argsort << std::endl; #endif + const size_t sort_dim_size = global_shape[DIM - 1]; assert(!is_index_space || DIM > 1); // not implemented for now + + // make a copy of the input + auto dense_input_copy = create_buffer(volume); if (dense) { - std_sort(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + auto* src = input.ptr(rect.lo); + std::copy(src, src + volume, dense_input_copy.ptr(0)); } else { - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < global_shape[i]) { break; } - } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - std_sort( - input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); - elements_processed += contiguous_elements; + auto* target = dense_input_copy.ptr(0); + for (size_t offset = 0; offset < volume; ++offset) { + auto point = pitches.unflatten(offset, rect.lo); + target[offset] = input[rect.lo + point]; } } - } -}; -// general routine ARGSORT -template -struct SortImplBody { - using VAL = legate_type_of; + // we need a buffer for argsort + auto indices_buffer = create_buffer(argsort ? 
volume : 0); - void std_argsort(const VAL* inptr, - int32_t* outptr, - const size_t volume, - const size_t sort_dim_size) - { - for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - int32_t* segmentKeys = outptr + start_idx; - const VAL* segmentValues = inptr + start_idx; - std::iota(segmentKeys, segmentKeys + sort_dim_size, 0); - std::stable_sort( - segmentKeys, segmentKeys + sort_dim_size, [segmentValues](int32_t i1, int32_t i2) { - return segmentValues[i1] < segmentValues[i2]; - }); - } - } - void operator()(AccessorRO input, - AccessorWO output, - const Pitches& pitches, - const Rect& rect, - const bool dense, - const size_t volume, - const bool argsort, - const Legion::DomainPoint global_shape, - const bool is_index_space, - const Legion::DomainPoint index_point, - const Legion::Domain domain) - { -#ifdef DEBUG_CUNUMERIC - std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume - << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << std::endl; -#endif - const size_t sort_dim_size = global_shape[DIM - 1]; - assert(!is_index_space || DIM > 1); // not implemented for now + // sort data + thrust_local_sort_inplace( + dense_input_copy.ptr(0), argsort ? indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); + + // copy back data (we assume output partition to be aligned to input!) if (dense) { - std_argsort(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); - } else { - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < global_shape[i]) { break; } + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); + } else { + AccessorWO output = output_array.write_accessor(rect); + std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - std_argsort( - input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); - elements_processed += contiguous_elements; + } else { + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + auto* source = indices_buffer.ptr(0); + for (size_t offset = 0; offset < volume; ++offset) { + auto point = pitches.unflatten(offset, rect.lo); + output[rect.lo + point] = source[offset]; + } + } else { + AccessorWO output = output_array.write_accessor(rect); + auto* source = dense_input_copy.ptr(0); + for (size_t offset = 0; offset < volume; ++offset) { + auto point = pitches.unflatten(offset, rect.lo); + output[rect.lo + point] = source[offset]; + } } } } } diff --git a/src/cunumeric/sort/sort.cu index a170a5069..49edec9e1 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -32,6 +32,34 @@ namespace cunumeric { using namespace Legion; +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + copy_into_buffer(VAL* out, + const AccessorRO accessor, + const Point lo, + const Pitches pitches, + const size_t volume) +{ + size_t offset = blockIdx.x * blockDim.x + threadIdx.x; + if (offset >= volume) return; + auto point = pitches.unflatten(offset, lo); + 
out[offset] = accessor[lo + point]; +} + +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + copy_into_output(AccessorWO accessor, + const VAL* data, + const Point lo, + const Pitches pitches, + const size_t volume) +{ + size_t offset = blockIdx.x * blockDim.x + threadIdx.x; + if (offset >= volume) return; + auto point = pitches.unflatten(offset, lo); + accessor[lo + point] = data[offset]; +} + struct multiply : public thrust::unary_function { const int constant; @@ -41,239 +69,187 @@ struct multiply : public thrust::unary_function { }; template -void cub_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) -{ - if (volume == sort_dim_size) { - size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortKeys(NULL, temp_storage_bytes, inptr, outptr, volume); - - auto temp_storage = - create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); - - cub::DeviceRadixSort::SortKeys(temp_storage.ptr(0), temp_storage_bytes, inptr, outptr, volume); - } else { - auto off_start_it = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); - auto off_end_it = - thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); - - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedRadixSort::SortKeys(NULL, - temp_storage_bytes, - inptr, - outptr, - volume, - volume / sort_dim_size, - off_start_it, - off_end_it); - auto temp_storage = - create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); - - cub::DeviceSegmentedRadixSort::SortKeys(temp_storage.ptr(0), - temp_storage_bytes, - inptr, - outptr, - volume, - volume / sort_dim_size, - off_start_it, - off_end_it); - } -} - -template -void thrust_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) +void cub_local_sort_inplace( + VAL* inptr, int32_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) { - thrust::device_ptr dev_input_ptr(inptr); - thrust::device_ptr dev_output_ptr(outptr); - thrust::copy(dev_input_ptr, dev_input_ptr + volume, dev_output_ptr); - // same approach as cupy implemntation --> combine multiple individual sorts into single - // kernel with data tuples - (id_sub-sort, actual_data) - if (volume == sort_dim_size) { - thrust::stable_sort(dev_output_ptr, dev_output_ptr + volume); - } else { - // in this case we know we are sorting for the *last* index - const uint64_t max_elements_per_kernel = - 1 << 22; // TODO check amount of available GPU memory from config - const uint64_t number_sorts_per_kernel = - std::max(1ul, std::min(volume, max_elements_per_kernel) / sort_dim_size); - const uint64_t number_sorts = volume / sort_dim_size; - - // std::cout << "Number of sorts per kernel: " << number_sorts_per_kernel << std::endl; - - if (number_sorts_per_kernel >= - 32) // key-tuple sort has quite some overhead -- only utilize if beneficial - { - // allocate memory for keys (iterating +=1 for each individual sort dimension) - // ensure keys have minimal bit-length (needs values up to number_sorts_per_kernel-1)! - // TODO!!!! 
- auto keys_array = create_buffer(number_sorts_per_kernel * sort_dim_size, - Legion::Memory::Kind::GPU_FB_MEM); - thrust::device_ptr dev_key_ptr(keys_array.ptr(0)); - - for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part += number_sorts_per_kernel) { - // compute size of batch (might be smaller for the last call) - const uint64_t num_elements = - std::min(number_sorts - sort_part, max_elements_per_kernel) * sort_dim_size; - const uint64_t offset = sort_part * sort_dim_size; - - // reinit keys - thrust::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_elements), - thrust::make_constant_iterator(sort_dim_size), - dev_key_ptr, - thrust::divides()); - - // sort - auto combined = - thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_output_ptr + offset)); - thrust::stable_sort( - combined, combined + num_elements, thrust::less>()); - } + // make a copy of input --> we want inptr to return sorted values + auto keys_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + cudaMemcpyAsync(keys_in.ptr(0), inptr, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream); + size_t temp_storage_bytes = 0; + if (argptr == nullptr) { + if (volume == sort_dim_size) { + // sort + cub::DeviceRadixSort::SortKeys( + NULL, temp_storage_bytes, keys_in.ptr(0), inptr, volume, 0, sizeof(VAL) * 8, stream); + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + cub::DeviceRadixSort::SortKeys(temp_storage.ptr(0), + temp_storage_bytes, + keys_in.ptr(0), + inptr, + volume, + 0, + sizeof(VAL) * 8, + stream); } else { - // number_sorts_per_kernel too small ----> we sort one after another - for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { - const uint64_t offset = sort_part * sort_dim_size; - thrust::stable_sort(dev_output_ptr + offset, dev_output_ptr + offset + sort_dim_size); - } + // segmented sort + auto off_start_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); + auto off_end_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); + + cub::DeviceSegmentedRadixSort::SortKeys(NULL, + temp_storage_bytes, + keys_in.ptr(0), + inptr, + volume, + volume / sort_dim_size, + off_start_it, + off_end_it, + 0, + sizeof(VAL) * 8, + stream); + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + + cub::DeviceSegmentedRadixSort::SortKeys(temp_storage.ptr(0), + temp_storage_bytes, + keys_in.ptr(0), + inptr, + volume, + volume / sort_dim_size, + off_start_it, + off_end_it, + 0, + sizeof(VAL) * 8, + stream); } - } -} - -template -void cub_argsort(const VAL* inptr, int32_t* outptr, const size_t volume, const size_t sort_dim_size) -{ - auto keys_out = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - thrust::device_ptr dev_key_out_ptr(keys_out.ptr(0)); - - auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - thrust::device_ptr dev_idx_in_ptr(idx_in.ptr(0)); - thrust::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(volume), - thrust::make_constant_iterator(sort_dim_size), - dev_idx_in_ptr, - thrust::modulus()); - - if (volume == sort_dim_size) { - size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, inptr, keys_out.ptr(0), idx_in.ptr(0), outptr, volume); - - auto temp_storage = - create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); - - 
cub::DeviceRadixSort::SortPairs(temp_storage.ptr(0), - temp_storage_bytes, - inptr, - keys_out.ptr(0), - idx_in.ptr(0), - outptr, - volume); } else { - auto off_start_it = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); - auto off_end_it = - thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); - - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedRadixSort::SortPairs(NULL, - temp_storage_bytes, - inptr, - keys_out.ptr(0), - idx_in.ptr(0), - outptr, - volume, - volume / sort_dim_size, - off_start_it, - off_end_it); - - auto temp_storage = - create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); - - cub::DeviceSegmentedRadixSort::SortPairs(temp_storage.ptr(0), - temp_storage_bytes, - inptr, - keys_out.ptr(0), - idx_in.ptr(0), - outptr, - volume, - volume / sort_dim_size, - off_start_it, - off_end_it); + auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + thrust::transform(thrust::cuda::par.on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + idx_in.ptr(0), + thrust::modulus()); + + if (volume == sort_dim_size) { + // argsort + cub::DeviceRadixSort::SortPairs(NULL, + temp_storage_bytes, + keys_in.ptr(0), + inptr, + idx_in.ptr(0), + argptr, + volume, + 0, + sizeof(VAL) * 8, + stream); + + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + + cub::DeviceRadixSort::SortPairs(temp_storage.ptr(0), + temp_storage_bytes, + keys_in.ptr(0), + inptr, + idx_in.ptr(0), + argptr, + volume, + 0, + sizeof(VAL) * 8, + stream); + } else { + // segmented argsort + auto off_start_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); + auto off_end_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); + + cub::DeviceSegmentedRadixSort::SortPairs(NULL, + temp_storage_bytes, + keys_in.ptr(0), + inptr, + idx_in.ptr(0), + argptr, + volume, + volume / sort_dim_size, + off_start_it, + off_end_it, + 0, + sizeof(VAL) * 8, + stream); + + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + + cub::DeviceSegmentedRadixSort::SortPairs(temp_storage.ptr(0), + temp_storage_bytes, + keys_in.ptr(0), + inptr, + idx_in.ptr(0), + argptr, + volume, + volume / sort_dim_size, + off_start_it, + off_end_it, + 0, + sizeof(VAL) * 8, + stream); + } } } template -void thrust_argsort(const VAL* inptr, - int32_t* outptr, - const size_t volume, - const size_t sort_dim_size) +void thrust_local_sort_inplace( + VAL* inptr, int32_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) { - thrust::device_ptr dev_input_ptr(inptr); - - auto keys_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - thrust::device_ptr dev_keys_copy_ptr(keys_copy.ptr(0)); - thrust::copy(dev_input_ptr, dev_input_ptr + volume, dev_keys_copy_ptr); - - thrust::device_ptr dev_output_ptr(outptr); - thrust::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(volume), - thrust::make_constant_iterator(sort_dim_size), - dev_output_ptr, - thrust::modulus()); - - // same approach as cupy implemntation --> combine multiple individual sorts into single - // kernel with data tuples - (id_sub-sort, actual_data) - if (volume == sort_dim_size) { - thrust::stable_sort_by_key(dev_keys_copy_ptr, dev_keys_copy_ptr + volume, dev_output_ptr); + if 
(argptr == nullptr) { + if (volume == sort_dim_size) { + thrust::stable_sort(thrust::cuda::par.on(stream), inptr, inptr + volume); + } else { + auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + // init combined keys + thrust::transform(thrust::cuda::par.on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + sort_id.ptr(0), + thrust::divides()); + auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), inptr)); + + thrust::stable_sort(thrust::cuda::par.on(stream), + combined, + combined + volume, + thrust::less>()); + } } else { - // in this case we know we are sorting for the *last* index - const uint64_t max_elements_per_kernel = - 1 << 22; // TODO check amount of available GPU memory from config - const uint64_t number_sorts_per_kernel = - std::max(1ul, std::min(volume, max_elements_per_kernel) / sort_dim_size); - const uint64_t number_sorts = volume / sort_dim_size; - - // std::cout << "Number of sorts per kernel: " << number_sorts_per_kernel << std::endl; - - if (number_sorts_per_kernel >= - 32) // key-tuple sort has quite some overhead -- only utilize if beneficial - { - // allocate memory for keys (iterating +=1 for each individual sort dimension) - // ensure keys have minimal bit-length (needs values up to number_sorts_per_kernel-1)! - // TODO!!!! - auto keys_array = create_buffer(number_sorts_per_kernel * sort_dim_size, - Legion::Memory::Kind::GPU_FB_MEM); - thrust::device_ptr dev_key_ptr(keys_array.ptr(0)); - - for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part += number_sorts_per_kernel) { - // compute size of batch (might be smaller for the last call) - const uint64_t num_elements = - std::min(number_sorts - sort_part, max_elements_per_kernel) * sort_dim_size; - const uint64_t offset = sort_part * sort_dim_size; - - // reinit keys - thrust::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_elements), - thrust::make_constant_iterator(sort_dim_size), - dev_key_ptr, - thrust::divides()); - - // sort - auto combined = - thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_keys_copy_ptr + offset)); - thrust::stable_sort_by_key(combined, - combined + num_elements, - dev_output_ptr + offset, - thrust::less>()); - } + // initialize indices + thrust::transform(thrust::cuda::par.on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + argptr, + thrust::modulus()); + + if (volume == sort_dim_size) { + thrust::stable_sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr); } else { - // number_sorts_per_kernel too small ----> we sort one after another - for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { - const uint64_t offset = sort_part * sort_dim_size; - thrust::stable_sort_by_key(dev_keys_copy_ptr + offset, - dev_keys_copy_ptr + offset + sort_dim_size, - dev_output_ptr + offset); - } + auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + // init combined keys + thrust::transform(thrust::cuda::par.on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + sort_id.ptr(0), + thrust::divides()); + auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), inptr)); + + thrust::stable_sort_by_key(thrust::cuda::par.on(stream), + combined, + combined + volume, + argptr, + 
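This zip-iterator path batches many small sorts into one launch, the same approach as the CuPy implementation referenced in the removed code: every element is tagged with its segment id (offset / sort_dim_size), and a single stable sort over (segment_id, value) tuples orders all segments at once. In NumPy terms (illustrative only, not part of the patch):

    import numpy as np

    values = np.array([3, 1, 2, 9, 7, 8])
    sort_dim_size = 3
    seg_id = np.arange(values.size) // sort_dim_size  # thrust::divides step

    # one stable sort over (segment_id, value) pairs: the segment id is
    # the primary key, so segments stay contiguous and each ends up
    # internally sorted -- np.lexsort's last key is the primary key
    order = np.lexsort((values, seg_id))
    batched = values[order]

    rows = values.reshape(-1, sort_dim_size)
    assert np.array_equal(batched, np.sort(rows, axis=1).ravel())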
thrust::less>()); } } } @@ -289,54 +265,35 @@ struct support_cub : std::false_type { }; template ::value>* = nullptr> -void sort_stable(const legate_type_of* inptr, - legate_type_of* outptr, - const size_t volume, - const size_t sort_dim_size) -{ - using VAL = legate_type_of; - cub_sort(inptr, outptr, volume, sort_dim_size); -} - -template ::value>* = nullptr> -void sort_stable(const legate_type_of* inptr, - legate_type_of* outptr, - const size_t volume, - const size_t sort_dim_size) -{ - using VAL = legate_type_of; - thrust_sort(inptr, outptr, volume, sort_dim_size); -} - -template ::value>* = nullptr> -void argsort_stable(const legate_type_of* inptr, - int32_t* outptr, - const size_t volume, - const size_t sort_dim_size) +void local_sort_inplace(legate_type_of* inptr, + int32_t* argptr, + const size_t volume, + const size_t sort_dim_size, + cudaStream_t stream) { using VAL = legate_type_of; - cub_argsort(inptr, outptr, volume, sort_dim_size); + cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } template ::value>* = nullptr> -void argsort_stable(const legate_type_of* inptr, - int32_t* outptr, - const size_t volume, - const size_t sort_dim_size) +void local_sort_inplace(legate_type_of* inptr, + int32_t* argptr, + const size_t volume, + const size_t sort_dim_size, + cudaStream_t stream) { using VAL = legate_type_of; - thrust_argsort(inptr, outptr, volume, sort_dim_size); + thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } template -struct SortImplBody { +struct SortImplBody { using VAL = legate_type_of; - void operator()(AccessorRO input, - AccessorWO output, + void operator()(const Array& input_array, + Array& output_array, const Pitches& pitches, const Rect& rect, - const bool dense, const size_t volume, const bool argsort, const Legion::DomainPoint global_shape, @@ -344,81 +301,77 @@ struct SortImplBody { const Legion::DomainPoint index_point, const Legion::Domain domain) { + AccessorRO input = input_array.read_accessor(rect); + + bool dense = input.accessor.is_dense_row_major(rect); + #ifdef DEBUG_CUNUMERIC std::cout << "GPU(" << getRank(domain, index_point) << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << std::endl; + << ", dense = " << dense << ", argsort. 
= " << argsort << std::endl; #endif + + auto stream = get_cached_stream(); + const size_t sort_dim_size = global_shape[DIM - 1]; assert(!is_index_space || DIM > 1); // not implemented for now + + // make a copy of the input + auto dense_input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); if (dense) { - sort_stable(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + cudaMemcpyAsync(dense_input_copy.ptr(0), + input.ptr(rect.lo), + sizeof(VAL) * volume, + cudaMemcpyDeviceToDevice, + stream); } else { - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < global_shape[i]) { break; } - } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - sort_stable( - input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); - elements_processed += contiguous_elements; - } + const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + copy_into_buffer<<>>( + dense_input_copy.ptr(0), input, rect.lo, pitches, volume); } - } -}; -template -struct SortImplBody { - using VAL = legate_type_of; + // we need a buffer for argsort + auto indices_buffer = + create_buffer(argsort ? volume : 0, Legion::Memory::Kind::GPU_FB_MEM); - void operator()(AccessorRO input, - AccessorWO output, - const Pitches& pitches, - const Rect& rect, - const bool dense, - const size_t volume, - const bool argsort, - const Legion::DomainPoint global_shape, - const bool is_index_space, - const Legion::DomainPoint index_point, - const Legion::Domain domain) - { -#ifdef DEBUG_CUNUMERIC - std::cout << "GPU(" << getRank(domain, index_point) << "): local size = " << volume - << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << std::endl; -#endif - assert(argsort); - const size_t sort_dim_size = global_shape[DIM - 1]; - assert(!is_index_space || DIM > 1); // not implemented for now + // sort data + local_sort_inplace(dense_input_copy.ptr(0), + argsort ? indices_buffer.ptr(0) : nullptr, + volume, + sort_dim_size, + stream); + + // copy back data (we assume output partition to be aligned to input!)
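For non-dense accessors, the staging copy above and the copy-back below walk the subrectangle by linear offset and let pitches.unflatten() recover the N-D point, which is the C-order analogue of NumPy's unravel_index. A small sketch of that mapping (hypothetical shape, illustrative only):

    import numpy as np

    shape = (4, 5)            # hypothetical local subrectangle extents
    staged = np.arange(20)    # dense staging buffer, e.g. sorted data

    out = np.empty(shape, dtype=staged.dtype)
    for offset in range(staged.size):
        # pitches.unflatten(offset, lo) maps a flat offset back to an
        # N-D point, exactly what np.unravel_index does in C order
        point = np.unravel_index(offset, shape)
        out[point] = staged[offset]

    assert np.array_equal(out.ravel(), staged)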
if (dense) { - argsort_stable(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); - } else { - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < global_shape[i]) { break; } + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + cudaMemcpyAsync(output.ptr(rect.lo), + indices_buffer.ptr(0), + sizeof(int32_t) * volume, + cudaMemcpyDeviceToDevice, + stream); + } else { + AccessorWO output = output_array.write_accessor(rect); + cudaMemcpyAsync(output.ptr(rect.lo), + dense_input_copy.ptr(0), + sizeof(VAL) * volume, + cudaMemcpyDeviceToDevice, + stream); } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - argsort_stable( - input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); - elements_processed += contiguous_elements; + } else { + const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + copy_into_output<<>>( + output, indices_buffer.ptr(0), rect.lo, pitches, volume); + } else { + AccessorWO output = output_array.write_accessor(rect); + copy_into_output<<>>( + output, dense_input_copy.ptr(0), rect.lo, pitches, volume); } } + CHECK_CUDA(cudaStreamSynchronize(stream)); } }; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 1fc560617..a728ebeb1 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -17,7 +17,9 @@ #include "cunumeric/sort/sort.h" #include "cunumeric/sort/sort_template.inl" -#include +#include +#include +#include #include namespace cunumeric { @@ -26,30 +28,38 @@ using namespace Legion; using namespace legate; template -struct SortImplBody { +struct SortImplBody { using VAL = legate_type_of; - void std_sort_omp(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) + // sorts inptr in-place, if argptr not nullptr it returns sort indices + void thrust_local_sort_inplace(VAL* inptr, + int32_t* argptr, + const size_t volume, + const size_t sort_dim_size) { - std::copy(inptr, inptr + volume, outptr); - if (volume / sort_dim_size > omp_get_max_threads() / 2) // TODO fine tune - { -#pragma omp do schedule(dynamic) - for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - std::stable_sort(outptr + start_idx, outptr + start_idx + sort_dim_size); + if (argptr == nullptr) { + // sort (in place) +#pragma omp parallel for + for (size_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + thrust::stable_sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size); } } else { - for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - __gnu_parallel::stable_sort(outptr + start_idx, outptr + start_idx + sort_dim_size); + // argsort +#pragma omp parallel for + for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + int32_t* segmentValues = argptr + start_idx; + VAL* segmentKeys = inptr + start_idx; + std::iota(segmentValues, segmentValues + sort_dim_size, 0); // init + thrust::stable_sort_by_key( + thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); } } } - void operator()(AccessorRO input, - AccessorWO output, + void operator()(const Array& input_array, + Array& output_array, const Pitches& 
pitches, const Rect& rect, - const bool dense, const size_t volume, const bool argsort, const Legion::DomainPoint global_shape, @@ -57,123 +67,65 @@ struct SortImplBody { const Legion::DomainPoint index_point, const Legion::Domain domain) { + AccessorRO input = input_array.read_accessor(rect); + + bool dense = input.accessor.is_dense_row_major(rect); + #ifdef DEBUG_CUNUMERIC - std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume + std::cout << "OMP(" << getRank(domain, index_point) << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << std::endl; + << ", dense = " << dense << ", argsort. = " << argsort << std::endl; #endif + const size_t sort_dim_size = global_shape[DIM - 1]; assert(!is_index_space || DIM > 1); // not implemented for now + + // make a copy of the input + auto dense_input_copy = create_buffer(volume, Legion::Memory::Kind::SOCKET_MEM); if (dense) { - std_sort_omp(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + auto* src = input.ptr(rect.lo); + std::copy(src, src + volume, dense_input_copy.ptr(0)); } else { - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < global_shape[i]) { break; } - } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - std_sort_omp( - input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); - elements_processed += contiguous_elements; + auto* target = dense_input_copy.ptr(0); + for (size_t offset = 0; offset < volume; ++offset) { + auto point = pitches.unflatten(offset, rect.lo); + target[offset] = input[rect.lo + point]; } } - } -}; -template -struct SortImplBody { - using VAL = legate_type_of; + // we need a buffer for argsort + auto indices_buffer = + create_buffer(argsort ? 
volume : 0, Legion::Memory::Kind::SOCKET_MEM); + + // sort data + thrust_local_sort_inplace( + dense_input_copy.ptr(0), argsort ? indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); + + // copy back data (we assume output partition to be aligned to input!)
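As in the sort.cc and sort.cu variants, the argsort path here fills the index buffer with 0..n-1 via std::iota and lets a stable sort-by-key drag the indices along, so equal keys keep their original relative order. NumPy fuses the two steps in one call (illustrative only):

    import numpy as np

    keys = np.array([5, 2, 5, 1])
    # np.argsort(kind="stable") combines std::iota (indices 0..n-1) with
    # thrust::stable_sort_by_key (stable sort of keys carrying indices)
    order = np.argsort(keys, kind="stable")
    assert np.array_equal(order, [3, 1, 0, 2])  # first 5 stays before second 5
    assert np.array_equal(keys[order], np.sort(keys))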
if (dense) { - std_argsort_omp(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); - } else { - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < global_shape[i]) { break; } + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); + } else { + AccessorWO output = output_array.write_accessor(rect); + std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - std_argsort_omp( - input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); - elements_processed += contiguous_elements; + } else { + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + auto* source = indices_buffer.ptr(0); + for (size_t offset = 0; offset < volume; ++offset) { + auto point = pitches.unflatten(offset, rect.lo); + output[rect.lo + point] = source[offset]; + } + } else { + AccessorWO output = output_array.write_accessor(rect); + auto* source = dense_input_copy.ptr(0); + for (size_t offset = 0; offset < volume; ++offset) { + auto point = pitches.unflatten(offset, rect.lo); + output[rect.lo + point] = source[offset]; + } } } } diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index c00b4a7e8..57ae935ad 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -21,7 +21,7 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template +template struct SortImplBody; static int getRank(Domain domain, DomainPoint index_point) @@ -69,53 +69,16 @@ struct SortImpl { "multi-dimensional array should not be distributed in (sort) dimension"); #endif - auto input = args.input.read_accessor(rect); - - if (args.argsort) { - auto output = args.output.write_accessor(rect); - -#ifndef LEGION_BOUNDS_CHECKS - bool dense = - input.accessor.is_dense_row_major(rect) && output.accessor.is_dense_row_major(rect); -#else - bool dense = false; -#endif - assert(dense || !args.is_index_space || DIM > 1); - - SortImplBody()(input, - output, - pitches, - rect, - dense, - volume, - args.argsort, - args.global_shape, - args.is_index_space, - args.task_index, - args.launch_domain); - - } else { - auto output = args.output.write_accessor(rect); - -#ifndef LEGION_BOUNDS_CHECKS - bool dense = - input.accessor.is_dense_row_major(rect) && output.accessor.is_dense_row_major(rect); -#else - bool dense = false; -#endif - assert(dense || !args.is_index_space || DIM > 1); - SortImplBody()(input, - output, - pitches, - rect, - dense, - volume, - args.argsort, - args.global_shape, - args.is_index_space, - args.task_index, - args.launch_domain); - } + SortImplBody()(args.input, + args.output, + pitches, + rect, + volume, + args.argsort, + args.global_shape, + args.is_index_space, + args.task_index, + args.launch_domain); } }; diff --git a/tests/sort.py b/tests/sort.py index 24385b06a..a085c0cbe 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -218,6 +218,7 @@ def test_dtypes(): test_api(generate_random((2, 5, 7), np.uint8)) test_api(generate_random((8, 5), np.uint16)) test_api(generate_random((22, 5, 7), np.uint32)) + test_api(generate_random((220,), np.uint32)) 
test_api(generate_random((2, 5, 7), np.int8)) test_api(generate_random((8, 5), np.int16)) @@ -227,9 +228,11 @@ def test_dtypes(): test_api(generate_random((8, 5), np.float32)) test_api(generate_random((8, 5), np.float64)) test_api(generate_random((22, 5, 7), np.double)) + test_api(generate_random((220,), np.double)) test_api(generate_random((2, 5, 7), np.complex64)) test_api(generate_random((2, 5, 7), np.complex128)) + test_api(generate_random((220,), np.complex128)) return From 6a061494debb41666f469851f7e0dadcbc6a35b0 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 25 Feb 2022 09:51:19 -0800 Subject: [PATCH 22/49] change argsort return type to int64 --- cunumeric/module.py | 2 +- src/cunumeric/sort/sort.cc | 10 ++++----- src/cunumeric/sort/sort.cu | 38 +++++++++++++++++----------------- src/cunumeric/sort/sort_omp.cc | 10 ++++----- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/cunumeric/module.py b/cunumeric/module.py index 4f6059f1c..0d6bc6e08 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5633,7 +5633,7 @@ def argsort(a, axis=-1, kind="stable", order=None): Single GPU, Single CPU """ - result = ndarray(a.shape, np.int32) + result = ndarray(a.shape, np.int64) result._thunk.sort( rhs=a._thunk, argsort=True, axis=axis, kind=kind, order=order ) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 30bcd4592..8bea7b5a6 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -33,7 +33,7 @@ struct SortImplBody { // sorts inptr in-place, if argptr not nullptr it returns sort indices void thrust_local_sort_inplace(VAL* inptr, - int32_t* argptr, + int64_t* argptr, const size_t volume, const size_t sort_dim_size) { @@ -45,7 +45,7 @@ struct SortImplBody { } else { // argsort for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - int32_t* segmentValues = argptr + start_idx; + int64_t* segmentValues = argptr + start_idx; VAL* segmentKeys = inptr + start_idx; std::iota(segmentValues, segmentValues + sort_dim_size, 0); // init thrust::stable_sort_by_key( @@ -93,7 +93,7 @@ struct SortImplBody { } // we need a buffer for argsort - auto indices_buffer = create_buffer(argsort ? volume : 0); + auto indices_buffer = create_buffer(argsort ? volume : 0); // sort data thrust_local_sort_inplace( @@ -102,7 +102,7 @@ struct SortImplBody { // copy back data (we assume output partition to be aliged to input!) 
if (dense) { if (argsort) { - AccessorWO output = output_array.write_accessor(rect); + AccessorWO output = output_array.write_accessor(rect); std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); } else { AccessorWO output = output_array.write_accessor(rect); @@ -110,7 +110,7 @@ struct SortImplBody { } } else { if (argsort) { - AccessorWO output = output_array.write_accessor(rect); + AccessorWO output = output_array.write_accessor(rect); auto* source = indices_buffer.ptr(0); for (size_t offset = 0; offset < volume; ++offset) { auto point = pitches.unflatten(offset, rect.lo); diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 49edec9e1..2ba8bba3d 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -70,7 +70,7 @@ struct multiply : public thrust::unary_function { template void cub_local_sort_inplace( - VAL* inptr, int32_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) + VAL* inptr, int64_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) { // make a copy of input --> we want inptr to return sorted values auto keys_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); @@ -125,13 +125,13 @@ void cub_local_sort_inplace( stream); } } else { - auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); thrust::transform(thrust::cuda::par.on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(volume), - thrust::make_constant_iterator(sort_dim_size), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), idx_in.ptr(0), - thrust::modulus()); + thrust::modulus()); if (volume == sort_dim_size) { // argsort @@ -202,13 +202,13 @@ void cub_local_sort_inplace( template void thrust_local_sort_inplace( - VAL* inptr, int32_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) + VAL* inptr, int64_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) { if (argptr == nullptr) { if (volume == sort_dim_size) { thrust::stable_sort(thrust::cuda::par.on(stream), inptr, inptr + volume); } else { - auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); // init combined keys thrust::transform(thrust::cuda::par.on(stream), thrust::make_counting_iterator(0), @@ -226,16 +226,16 @@ void thrust_local_sort_inplace( } else { // intialize indices thrust::transform(thrust::cuda::par.on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(volume), - thrust::make_constant_iterator(sort_dim_size), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), argptr, - thrust::modulus()); + thrust::modulus()); if (volume == sort_dim_size) { thrust::stable_sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr); } else { - auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); // init combined keys thrust::transform(thrust::cuda::par.on(stream), thrust::make_counting_iterator(0), @@ -266,7 +266,7 @@ struct support_cub : std::false_type { template ::value>* = nullptr> void local_sort_inplace(legate_type_of* inptr, - int32_t* argptr, + int64_t* argptr, const size_t 
volume, const size_t sort_dim_size, cudaStream_t stream) @@ -277,7 +277,7 @@ void local_sort_inplace(legate_type_of* inptr, template ::value>* = nullptr> void local_sort_inplace(legate_type_of* inptr, - int32_t* argptr, + int64_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) @@ -333,7 +333,7 @@ struct SortImplBody { // we need a buffer for argsort auto indices_buffer = - create_buffer(argsort ? volume : 0, Legion::Memory::Kind::GPU_FB_MEM); + create_buffer(argsort ? volume : 0, Legion::Memory::Kind::GPU_FB_MEM); // sort data local_sort_inplace(dense_input_copy.ptr(0), @@ -345,10 +345,10 @@ struct SortImplBody { // copy back data (we assume output partition to be aliged to input!) if (dense) { if (argsort) { - AccessorWO output = output_array.write_accessor(rect); + AccessorWO output = output_array.write_accessor(rect); cudaMemcpyAsync(output.ptr(rect.lo), indices_buffer.ptr(0), - sizeof(int32_t) * volume, + sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream); } else { @@ -362,7 +362,7 @@ struct SortImplBody { } else { const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; if (argsort) { - AccessorWO output = output_array.write_accessor(rect); + AccessorWO output = output_array.write_accessor(rect); copy_into_output<<>>( output, indices_buffer.ptr(0), rect.lo, pitches, volume); } else { diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index a728ebeb1..0e8cbe7e6 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -33,7 +33,7 @@ struct SortImplBody { // sorts inptr in-place, if argptr not nullptr it returns sort indices void thrust_local_sort_inplace(VAL* inptr, - int32_t* argptr, + int64_t* argptr, const size_t volume, const size_t sort_dim_size) { @@ -47,7 +47,7 @@ struct SortImplBody { // argsort #pragma omp parallel for for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - int32_t* segmentValues = argptr + start_idx; + int64_t* segmentValues = argptr + start_idx; VAL* segmentKeys = inptr + start_idx; std::iota(segmentValues, segmentValues + sort_dim_size, 0); // init thrust::stable_sort_by_key( @@ -96,7 +96,7 @@ struct SortImplBody { // we need a buffer for argsort auto indices_buffer = - create_buffer(argsort ? volume : 0, Legion::Memory::Kind::SOCKET_MEM); + create_buffer(argsort ? volume : 0, Legion::Memory::Kind::SOCKET_MEM); // sort data thrust_local_sort_inplace( @@ -105,7 +105,7 @@ struct SortImplBody { // copy back data (we assume output partition to be aliged to input!) 
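Widening the index buffers from int32_t to int64_t matches NumPy, whose argsort returns np.intp (int64 on 64-bit platforms), and removes the 2**31 - 1 element limit an int32 index would impose. A quick check (illustrative only):

    import numpy as np

    a = np.array([3, 1, 2])
    # NumPy argsort indices are np.intp (int64 on 64-bit platforms);
    # int32 indices would overflow for extents beyond 2**31 - 1
    assert np.argsort(a).dtype == np.intp
    assert np.iinfo(np.int32).max == 2**31 - 1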
if (dense) { if (argsort) { - AccessorWO output = output_array.write_accessor(rect); + AccessorWO output = output_array.write_accessor(rect); std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); } else { AccessorWO output = output_array.write_accessor(rect); @@ -113,7 +113,7 @@ struct SortImplBody { } } else { if (argsort) { - AccessorWO output = output_array.write_accessor(rect); + AccessorWO output = output_array.write_accessor(rect); auto* source = indices_buffer.ptr(0); for (size_t offset = 0; offset < volume; ++offset) { auto point = pitches.unflatten(offset, rect.lo); From ca889b935e1b26fe4b340d9c286d88670f0ea845 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 25 Feb 2022 13:50:56 -0800 Subject: [PATCH 23/49] resolved earlier merge issue --- cunumeric/array.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index eaa949cbc..318b3ea18 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1480,9 +1480,7 @@ def setfield(self, val, dtype, offset=0): ) def setflags(self, write=None, align=None, uic=None): - self.__array__(stacklevel=2).setflags( - write=write, align=align, uic=uic - ) + self.__array__().setflags(write=write, align=align, uic=uic) def sort(self, axis=-1, kind="stable", order=None): self._thunk.sort(rhs=self._thunk, axis=axis, kind=kind, order=order) From 5897c686ce7d408a1abacf9e8c6ff07e51fd74b1 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 28 Feb 2022 02:51:37 -0800 Subject: [PATCH 24/49] deactivate test for dimesions > 4 --- tests/sort.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sort.py b/tests/sort.py index a085c0cbe..3cb593abb 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -246,8 +246,8 @@ def test(): test_3D(51, 23, 17) print("\n\n ----------- 3D test (complex) -----\n") test_3D_complex(27, 30, 45) - print("\n\n ----------- 4D/5D test-------------\n") - test_custom() + # print("\n\n ----------- 4D/5D test-------------\n") + # test_custom() print("\n\n ----------- API test --------------\n") test_api() print("\n\n ----------- dtype test ------------\n") From e24eccaf4c0a8dde5a426db2a6a01d4142b53327 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 2 Mar 2022 14:13:54 -0800 Subject: [PATCH 25/49] Distributed 1-D Sort on GPU --- cunumeric/sorting/sorting.py | 28 +- src/cunumeric/sort/sort.cc | 3 +- src/cunumeric/sort/sort.cu | 404 +++++++++++++++++++++++---- src/cunumeric/sort/sort_omp.cc | 3 +- src/cunumeric/sort/sort_template.inl | 20 +- tests/sort.py | 43 ++- 6 files changed, 405 insertions(+), 96 deletions(-) diff --git a/cunumeric/sorting/sorting.py b/cunumeric/sorting/sorting.py index 246b3abe3..b2c72a1a2 100644 --- a/cunumeric/sorting/sorting.py +++ b/cunumeric/sorting/sorting.py @@ -57,30 +57,34 @@ def sort_swapped(output, input, argsort, sort_axis): def sort_task(output, input, argsort): - needs_communication = output.runtime.num_gpus > 1 or ( - output.runtime.num_gpus == 0 and output.runtime.num_procs > 1 - ) + task = output.context.create_task(CuNumericOpCode.SORT) - if needs_communication: - output.runtime.legate_runtime.issue_execution_fence(block=True) + needs_unbound_output = output.runtime.num_gpus > 1 and input.ndim == 1 - task = output.context.create_task(CuNumericOpCode.SORT) + if needs_unbound_output: + unbound = output.runtime.create_unbound_thunk(dtype=output.dtype) + task.add_output(unbound.base) + else: + task.add_output(output.base) + task.add_alignment(output.base, input.base) - 
task.add_output(output.base) task.add_input(input.base) - task.add_alignment(output.base, input.base) + if output.ndim > 1: task.add_broadcast(input.base, input.ndim - 1) - elif needs_communication: - # print("Distributed 1D sort --> broadcast") + elif output.runtime.num_gpus > 0: + task.add_nccl_communicator() + elif output.runtime.num_procs > 1: + # Distributed 1D sort on CPU not supported yet task.add_broadcast(input.base) task.add_scalar_arg(argsort, bool) # return indices flag task.add_scalar_arg(input.base.shape, (ty.int32,)) task.execute() - if needs_communication: - output.runtime.legate_runtime.issue_execution_fence(block=True) + if needs_unbound_output: + output.base = unbound.base + output.numpy_array = None def sorting(output, input, argsort, axis=-1): diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 8bea7b5a6..2a7264a39 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -63,7 +63,8 @@ struct SortImplBody { const Legion::DomainPoint global_shape, const bool is_index_space, const Legion::DomainPoint index_point, - const Legion::Domain domain) + const Legion::Domain domain, + const std::vector& comms) { AccessorRO input = input_array.read_accessor(rect); diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 2ba8bba3d..6a25249e9 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -21,10 +21,12 @@ #include #include #include +#include #include #include #include #include +#include #include "cunumeric/cuda_help.h" @@ -126,12 +128,8 @@ void cub_local_sort_inplace( } } else { auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - thrust::transform(thrust::cuda::par.on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(volume), - thrust::make_constant_iterator(sort_dim_size), - idx_in.ptr(0), - thrust::modulus()); + cudaMemcpyAsync( + idx_in.ptr(0), argptr, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream); if (volume == sort_dim_size) { // argsort @@ -224,14 +222,6 @@ void thrust_local_sort_inplace( thrust::less>()); } } else { - // intialize indices - thrust::transform(thrust::cuda::par.on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(volume), - thrust::make_constant_iterator(sort_dim_size), - argptr, - thrust::modulus()); - if (volume == sort_dim_size) { thrust::stable_sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr); } else { @@ -272,7 +262,7 @@ void local_sort_inplace(legate_type_of* inptr, cudaStream_t stream) { using VAL = legate_type_of; - cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); + if (volume > 0) { cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } } template ::value>* = nullptr> @@ -283,7 +273,300 @@ void local_sort_inplace(legate_type_of* inptr, cudaStream_t stream) { using VAL = legate_type_of; - thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); + if (volume > 0) { thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } +} + +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + print_subset(const VAL* data, const size_t volume, const size_t rank) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx == 0) { + printf("data(%d) = [ ", rank); + for (int i = 0; i < volume; ++i) { printf("%d ", data[i]); } + printf("]\n"); + } +} + +// auto align to multiples of 16 bytes +auto get_aligned_size = [](auto size) { return 
std::max(16, (size + 15) / 16 * 16); }; + +template +struct SortPiece { + Buffer values; + Buffer indices; + size_t size; +}; + +template +struct Sample { + VAL value; + int32_t rank; + size_t position; +}; + +template +struct SampleComparator : public thrust::binary_function, Sample, bool> { + __host__ __device__ bool operator()(const Sample& lhs, const Sample& rhs) const + { + // special case for unused samples + if (lhs.rank < 0 || rhs.rank < 0) { return rhs.rank < 0 && lhs.rank >= 0; } + + if (lhs.value != rhs.value) { + return lhs.value < rhs.value; + } else if (lhs.rank != rhs.rank) { + return lhs.rank < rhs.rank; + } else { + return lhs.position < rhs.position; + } + } +}; + +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + extract_samples(const VAL* data, + const size_t volume, + Sample* samples, + const size_t num_local_samples, + const Sample init_sample, + const size_t offset, + const size_t rank) +{ + const size_t sample_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (sample_idx >= num_local_samples) return; + + if (num_local_samples < volume) { + const size_t index = (sample_idx + 1) * volume / num_local_samples - 1; + samples[offset + sample_idx].value = data[index]; + samples[offset + sample_idx].rank = rank; + samples[offset + sample_idx].position = index; + // printf("Sample rank %lu position %lu offset %lu\n", rank, index, (offset+sample_idx)); + } else { + // edge case where num_local_samples > volume + if (sample_idx < volume) { + samples[offset + sample_idx].value = data[sample_idx]; + samples[offset + sample_idx].rank = rank; + samples[offset + sample_idx].position = sample_idx; + } else { + samples[offset + sample_idx] = init_sample; + } + } +} + +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + extract_split_positions(const VAL* data, + const size_t volume, + const Sample* samples, + const size_t num_samples, + size_t* split_positions, + const size_t num_splitters, + const size_t rank) +{ + const size_t splitter_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (splitter_idx >= num_splitters) return; + + const size_t index = (splitter_idx + 1) * num_samples / (num_splitters + 1) - 1; + const Sample splitter = samples[index]; + + // now perform search on data to receive position *after* last element to be + // part of the package for rank splitter_idx + if (rank > splitter.rank) { + // position of the last position with smaller value than splitter.value + 1 + split_positions[splitter_idx] = cub::LowerBound(data, volume, splitter.value); + } else if (rank < splitter.rank) { + // position of the first position with value larger than splitter.value + split_positions[splitter_idx] = cub::UpperBound(data, volume, splitter.value); + } else { + split_positions[splitter_idx] = splitter.position + 1; + } + // printf("Splitter position id %lu rank %lu position %lu num_samples %lu\n", splitter_idx, rank, + // split_positions[splitter_idx], num_samples); +} + +template +static SortPiece sample_sort_nccl(SortPiece local_sorted, + size_t my_rank, + size_t num_ranks, + bool argsort, + cudaStream_t stream, + ncclComm_t* comm) +{ + size_t volume = local_sorted.size; + + // collect local samples + size_t num_local_samples = num_ranks; // handle case numRanks > volume!! 
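For orientation, the kernels that follow implement the standard splitter selection of a sample sort: every rank draws num_ranks regularly spaced samples from its locally sorted chunk, the gathered samples are sorted globally, and every num_ranks-th usable entry becomes a splitter. A host-side NumPy sketch of that selection (simulated ranks in one process, hypothetical helper name, no NCCL; assumes each chunk holds at least num_ranks elements):

    import numpy as np

    def select_splitters(local_sorted_parts):
        num_ranks = len(local_sorted_parts)
        samples = []
        for part in local_sorted_parts:
            n = len(part)
            # num_ranks regularly spaced samples per rank, biased to segment ends
            idx = [(i + 1) * n // num_ranks - 1 for i in range(num_ranks)]
            samples.extend(part[j] for j in idx)
        samples = np.sort(np.asarray(samples))
        # every num_ranks-th sample becomes one of the num_ranks - 1 splitters
        return samples[num_ranks - 1 :: num_ranks][: num_ranks - 1]

    parts = [np.sort(np.random.default_rng(r).integers(0, 100, 16))
             for r in range(4)]
    print(select_splitters(parts))  # three split values for four ranks
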
+ size_t num_global_samples = num_local_samples * num_ranks; + auto samples = create_buffer>(num_global_samples, Memory::GPU_FB_MEM); + + Sample init_sample; + { + const size_t num_blocks = (num_local_samples + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + init_sample.rank = -1; // init samples that are not populated + size_t offset = num_local_samples * my_rank; + extract_samples<<>>(local_sorted.values.ptr(0), + volume, + samples.ptr(0), + num_local_samples, + init_sample, + offset, + my_rank); + } + + // AllGather: check alignment? as we want to receive data in-place we take exact size for now + CHECK_NCCL(ncclAllGather(samples.ptr(my_rank * num_ranks), + samples.ptr(0), + num_ranks * sizeof(Sample), + ncclInt8, + *comm, + stream)); + + // sort samples on device + thrust::stable_sort(thrust::cuda::par.on(stream), + samples.ptr(0), + samples.ptr(0) + num_global_samples, + SampleComparator()); + + auto lower_bound = thrust::lower_bound(thrust::cuda::par.on(stream), + samples.ptr(0), + samples.ptr(0) + num_global_samples, + init_sample, + SampleComparator()); + size_t num_usable_samples = lower_bound - samples.ptr(0); + + // select splitters / positions based on samples (on device) + const size_t num_splitters = num_ranks - 1; + auto split_positions = create_buffer(num_splitters, Memory::Z_COPY_MEM); + { + const size_t num_blocks = (num_splitters + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + VAL init_value = std::numeric_limits::max(); + extract_split_positions<<>>( + local_sorted.values.ptr(0), + volume, + samples.ptr(0), + num_usable_samples, + split_positions.ptr(0), + num_splitters, + my_rank); + } + + // need to sync as we share values in between host/device + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // collect sizes2send, send to rank i: local_sort_data from positions split_positions[i-1], + // split_positions[i] - 1 + auto size_send = create_buffer(num_ranks, Memory::Z_COPY_MEM); + { + size_t last_position = 0; + for (size_t rank = 0; rank < num_ranks - 1; ++rank) { + size_t cur_position = split_positions[rank]; + size_send[rank] = cur_position - last_position; + last_position = cur_position; + } + size_send[num_ranks - 1] = volume - last_position; + } + + // need to sync as we share values in between host/device + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // all2all exchange send/receive sizes + auto size_recv = create_buffer(num_ranks, Memory::Z_COPY_MEM); + CHECK_NCCL(ncclGroupStart()); + for (int r = 0; r < num_ranks; r++) { + CHECK_NCCL(ncclSend(size_send.ptr(r), 1, ncclUint64, r, *comm, stream)); + CHECK_NCCL(ncclRecv(size_recv.ptr(r), 1, ncclUint64, r, *comm, stream)); + } + CHECK_NCCL(ncclGroupEnd()); + + // need to sync as we share values in between host/device + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // allocate merge targets, data transfer... + std::vector> merge_buffers(num_ranks); + + for (int i = 0; i < merge_buffers.size(); ++i) { + // align buffer to allow data transfer of 16byte blocks + auto recv_size_aligned = get_aligned_size(size_recv[i] * sizeof(VAL)); + auto buf_size = (recv_size_aligned + sizeof(VAL) - 1) / sizeof(VAL); + merge_buffers[i].values = create_buffer(buf_size, Memory::GPU_FB_MEM); + merge_buffers[i].indices = create_buffer(argsort ? 
buf_size : 0, Memory::GPU_FB_MEM); + merge_buffers[i].size = size_recv[i]; + } + size_t send_pos = 0; + CHECK_NCCL(ncclGroupStart()); + for (int r = 0; r < num_ranks; r++) { + CHECK_NCCL(ncclSend(local_sorted.values.ptr(send_pos), + get_aligned_size(size_send[r] * sizeof(VAL)), + ncclInt8, + r, + *comm, + stream)); + CHECK_NCCL(ncclRecv(merge_buffers[r].values.ptr(0), + get_aligned_size(size_recv[r] * sizeof(VAL)), + ncclInt8, + r, + *comm, + stream)); + if (argsort) { + CHECK_NCCL( + ncclSend(local_sorted.indices.ptr(send_pos), size_send[r], ncclInt64, r, *comm, stream)); + CHECK_NCCL( + ncclRecv(merge_buffers[r].indices.ptr(0), size_recv[r], ncclInt64, r, *comm, stream)); + } + send_pos += size_send[r]; + } + CHECK_NCCL(ncclGroupEnd()); + + // now merge sort all into the result buffer + // maybe k-way merge is more efficient here... + for (size_t stride = 1; stride < num_ranks; stride *= 2) { + for (size_t pos = 0; pos + stride < num_ranks; pos += 2 * stride) { + SortPiece source1 = merge_buffers[pos]; + SortPiece source2 = merge_buffers[pos + stride]; + auto merged_size = source1.size + source2.size; + auto merged_values = create_buffer(merged_size); + auto merged_indices = source1.indices; // will be overriden for argsort + auto p_merged_values = merged_values.ptr(0); + auto p_values1 = source1.values.ptr(0); + auto p_values2 = source2.values.ptr(0); + if (argsort) { + merged_indices = create_buffer(merged_size); + // merge with key/value + auto p_indices1 = source1.indices.ptr(0); + auto p_indices2 = source2.indices.ptr(0); + auto p_merged_indices = merged_indices.ptr(0); + thrust::merge_by_key(thrust::cuda::par.on(stream), + p_values1, + p_values1 + source1.size, + p_values2, + p_values2 + source2.size, + p_indices1, + p_indices2, + p_merged_values, + p_merged_indices); + CHECK_CUDA(cudaStreamSynchronize(stream)); + source1.indices.destroy(); + } else { + thrust::merge(thrust::cuda::par.on(stream), + p_values1, + p_values1 + source1.size, + p_values2, + p_values2 + source2.size, + p_merged_values); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + source1.values.destroy(); + source2.values.destroy(); + source2.indices.destroy(); + + merge_buffers[pos].values = merged_values; + merge_buffers[pos].indices = merged_indices; + merge_buffers[pos].size = merged_size; + } + } + return merge_buffers[0]; } template @@ -299,41 +582,54 @@ struct SortImplBody { const Legion::DomainPoint global_shape, const bool is_index_space, const Legion::DomainPoint index_point, - const Legion::Domain domain) + const Legion::Domain domain, + const std::vector& comms) { AccessorRO input = input_array.read_accessor(rect); - bool dense = input.accessor.is_dense_row_major(rect); + size_t my_rank = getRank(domain, index_point); #ifdef DEBUG_CUNUMERIC - std::cout << "GPU(" << getRank(domain, index_point) << "): local size = " << volume - << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << ", argsort. = " << argsort << std::endl; + std::cout << "GPU(" << my_rank << "): local size = " << volume << ", dist. = " << is_index_space + << ", index_point = " << index_point << ", domain/volume = " << domain << "/" + << domain.get_volume() << ", dense = " << input.accessor.is_dense_row_major(rect) + << ", argsort. 
= " << argsort << std::endl; #endif + assert(rect.empty() || input.accessor.is_dense_row_major(rect)); + auto stream = get_cached_stream(); - const size_t sort_dim_size = global_shape[DIM - 1]; - assert(!is_index_space || DIM > 1); // not implemented for now + const size_t sort_dim_size = DIM == 1 ? volume : global_shape[DIM - 1]; // make a copy of the input auto dense_input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - if (dense) { - cudaMemcpyAsync(dense_input_copy.ptr(0), - input.ptr(rect.lo), - sizeof(VAL) * volume, - cudaMemcpyDeviceToDevice, - stream); - } else { - const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - copy_into_buffer<<>>( - dense_input_copy.ptr(0), input, rect.lo, pitches, volume); - } + cudaMemcpyAsync(dense_input_copy.ptr(0), + input.ptr(rect.lo), + sizeof(VAL) * volume, + cudaMemcpyDeviceToDevice, + stream); // we need a buffer for argsort auto indices_buffer = create_buffer(argsort ? volume : 0, Legion::Memory::Kind::GPU_FB_MEM); + if (argsort && volume > 0) { + // intialize + if (DIM == 1) { + size_t offset = DIM > 1 ? 0 : rect.lo[0]; + thrust::sequence(thrust::cuda::par.on(stream), + indices_buffer.ptr(0), + indices_buffer.ptr(0) + volume, + offset); + } else { + thrust::transform(thrust::cuda::par.on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + indices_buffer.ptr(0), + thrust::modulus()); + } + } // sort data local_sort_inplace(dense_input_copy.ptr(0), @@ -342,10 +638,32 @@ struct SortImplBody { sort_dim_size, stream); - // copy back data (we assume output partition to be aliged to input!) - if (dense) { + // this is linked to the decision in sorting.py on when to use adn 'unbounded' output array. + if (output_array.dim() == -1) { + SortPiece local_sorted; + local_sorted.values = dense_input_copy; + local_sorted.indices = indices_buffer; + local_sorted.size = volume; + SortPiece local_sorted_repartitioned = is_index_space + ? sample_sort_nccl(local_sorted, + my_rank, + domain.get_volume(), + argsort, + stream, + comms[0].get()) + : local_sorted; + if (argsort) { + output_array.return_data(local_sorted_repartitioned.indices, + local_sorted_repartitioned.size); + } else { + output_array.return_data(local_sorted_repartitioned.values, + local_sorted_repartitioned.size); + } + } else { + // copy back data (we assume output partition to be aliged to input!) 
if (argsort) { AccessorWO output = output_array.write_accessor(rect); + assert(output.accessor.is_dense_row_major(rect)); cudaMemcpyAsync(output.ptr(rect.lo), indices_buffer.ptr(0), sizeof(int64_t) * volume, @@ -353,23 +671,13 @@ struct SortImplBody { stream); } else { AccessorWO output = output_array.write_accessor(rect); + assert(output.accessor.is_dense_row_major(rect)); cudaMemcpyAsync(output.ptr(rect.lo), dense_input_copy.ptr(0), sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream); } - } else { - const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - copy_into_output<<>>( - output, indices_buffer.ptr(0), rect.lo, pitches, volume); - } else { - AccessorWO output = output_array.write_accessor(rect); - copy_into_output<<>>( - output, dense_input_copy.ptr(0), rect.lo, pitches, volume); - } } CHECK_CUDA(cudaStreamSynchronize(stream)); } diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 0e8cbe7e6..1416bd394 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -65,7 +65,8 @@ struct SortImplBody { const Legion::DomainPoint global_shape, const bool is_index_space, const Legion::DomainPoint index_point, - const Legion::Domain domain) + const Legion::Domain domain, + const std::vector& comms) { AccessorRO input = input_array.read_accessor(rect); diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 57ae935ad..2360f1068 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -37,16 +37,12 @@ static int getRank(Domain domain, DomainPoint index_point) template struct SortImpl { template - void operator()(SortArgs& args) const + void operator()(SortArgs& args, std::vector& comms) const { using VAL = legate_type_of; auto rect = args.input.shape(); - // we shall not return on empty rectangle in case of distributed data - // as the process might still participate in the parallel sort - if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; - Pitches pitches; size_t volume = pitches.flatten(rect); @@ -55,8 +51,8 @@ struct SortImpl { * 1. Sort is always requested for the 'last' dimension within rect * 2. We have product_of_all_other_dimensions independent sort ranges * 3. if we have more than one participants: - * a) 1D-case: we need to perform parallel sort (e.g. via sampling) -- not implemented yet - * b) ND-case: rect needs to be the full domain in that last dimension + * a) 1D-case: we need to perform parallel sort (e.g. 
via sampling) -- (only implemented for + * GPU) b) ND-case: rect needs to be the full domain in that last dimension * */ @@ -69,6 +65,10 @@ struct SortImpl { "multi-dimensional array should not be distributed in (sort) dimension"); #endif + // we shall not return on empty rectangle in case of distributed data + // as the process might still participate in the parallel sort + if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; + SortImplBody()(args.input, args.output, pitches, @@ -78,7 +78,8 @@ struct SortImpl { args.global_shape, args.is_index_space, args.task_index, - args.launch_domain); + args.launch_domain, + comms); } }; @@ -99,7 +100,8 @@ static void sort_template(TaskContext& context) !context.is_single_task(), context.get_task_index(), context.get_launch_domain()}; - double_dispatch(args.input.dim(), args.input.code(), SortImpl{}, args); + double_dispatch( + args.input.dim(), args.input.code(), SortImpl{}, args, context.communicators()); } } // namespace cunumeric diff --git a/tests/sort.py b/tests/sort.py index 3cb593abb..9cac07195 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -20,9 +20,9 @@ def compare_assert(a_np, a_num): if not num.allclose(a_np, a_num): - print("numpy:") + print("numpy, shape " + str(a_np.shape) + ":") print(a_np) - print("cuNumeric:") + print("cuNumeric, shape " + str(a_num.shape) + ":") print(a_num) assert False @@ -30,11 +30,11 @@ def compare_assert(a_np, a_num): def test_sort_axis(a_np, a_num, axis): compare_assert(a_np, a_num) print("Sorting axis " + str(axis) + ":") - sort_np = np.sort(a_np, axis) + sort_np = np.sort(a_np, axis, kind="stable") sort_num = num.sort(a_num, axis) compare_assert(sort_np, sort_num) - argsort_np = np.sort(a_np, axis) - argsort_num = num.sort(a_num, axis) + argsort_np = np.argsort(a_np, axis, kind="stable") + argsort_num = num.argsort(a_num, axis) compare_assert(argsort_np, argsort_num) @@ -48,7 +48,6 @@ def test_1D(): sortA_np = np.sort(A_np) print("Result numpy : " + str(sortA_np)) - # pdb.set_trace() sortA_num = num.sort(A_num) print("Result cunumeric: " + str(sortA_num)) compare_assert(sortA_np, sortA_num) @@ -116,24 +115,13 @@ def test_3D_complex(x_dim, y_dim, z_dim): def test_custom(): - # 4D still works, >=5D always falls back to numpy - a = np.arange(4 * 2 * 2 * 4).reshape(4, 2, 2, 4) - a_num = num.array(a) - - test_sort_axis(a, a_num, 1) - test_sort_axis(a, a_num, 2) - test_sort_axis(a, a_num, a.ndim - 1) - - a = np.arange(4 * 4 * 5 * 2 * 3 * 2 * 2 * 2 * 4).reshape( - 4, 4, 5, 2, 3, 2, 2, 2, 4 - ) + np.random.seed(42) + a = generate_random((4,), np.uint8) + print("Matrix A") + print(a) a_num = num.array(a) - - test_sort_axis(a, a_num, 1) - test_sort_axis(a, a_num, 2) - test_sort_axis(a, a_num, 7) - test_sort_axis(a, a_num, 4) + compare_assert(np.sort_complex(a), num.sort_complex(a_num)) return @@ -145,17 +133,21 @@ def test_api(a=None): # sort axes for i in range(a.ndim): + print("sort axis " + str(i)) compare_assert(np.sort(a, axis=i, kind="stable"), num.sort(a_num, i)) # flatten + print("sort flattened") compare_assert( np.sort(a, axis=None, kind="stable"), num.sort(a_num, axis=None) ) # msort + print("msort") compare_assert(np.msort(a), num.msort(a_num)) # sort_complex + print("sort_complex") compare_assert(np.sort_complex(a), num.sort_complex(a_num)) # reverse order sort @@ -174,11 +166,13 @@ def test_api(a=None): # argsort for i in range(a.ndim): compare_assert(a, a_num) + print("argsort axis " + str(i)) compare_assert( np.argsort(a, axis=i, kind="stable"), num.argsort(a_num, axis=i) ) # flatten + 
print("argsort flattened") compare_assert( np.argsort(a, axis=None, kind="stable"), num.argsort(a_num, axis=None) ) @@ -210,7 +204,7 @@ def generate_random(shape, datatype): else: print("UNKNOWN type " + str(datatype)) assert False - return a_np + return a_np.reshape(shape) def test_dtypes(): @@ -246,8 +240,6 @@ def test(): test_3D(51, 23, 17) print("\n\n ----------- 3D test (complex) -----\n") test_3D_complex(27, 30, 45) - # print("\n\n ----------- 4D/5D test-------------\n") - # test_custom() print("\n\n ----------- API test --------------\n") test_api() print("\n\n ----------- dtype test ------------\n") @@ -256,3 +248,4 @@ def test(): if __name__ == "__main__": test() + # test_custom() From 3eeebd2dd1af3941cee93438fd7cd228042baae5 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 7 Mar 2022 01:24:18 -0800 Subject: [PATCH 26/49] remove explicit host memory type --- src/cunumeric/sort/sort_omp.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 1416bd394..0fe8b92a4 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -83,7 +83,7 @@ struct SortImplBody { assert(!is_index_space || DIM > 1); // not implemented for now // make a copy of the input - auto dense_input_copy = create_buffer(volume, Legion::Memory::Kind::SOCKET_MEM); + auto dense_input_copy = create_buffer(volume); if (dense) { auto* src = input.ptr(rect.lo); std::copy(src, src + volume, dense_input_copy.ptr(0)); @@ -96,8 +96,7 @@ struct SortImplBody { } // we need a buffer for argsort - auto indices_buffer = - create_buffer(argsort ? volume : 0, Legion::Memory::Kind::SOCKET_MEM); + auto indices_buffer = create_buffer(argsort ? volume : 0); // sort data thrust_local_sort_inplace( From 6a7e736628a609192710a7624c666dd3b0a020f3 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 7 Mar 2022 01:37:51 -0800 Subject: [PATCH 27/49] assume all data is dense according to mapping config --- src/cunumeric/sort/sort.cc | 26 +++----------------------- src/cunumeric/sort/sort_omp.cc | 27 +++------------------------ 2 files changed, 6 insertions(+), 47 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 2a7264a39..ca0bf1545 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -78,19 +78,15 @@ struct SortImplBody { #endif const size_t sort_dim_size = global_shape[DIM - 1]; + + assert(dense); assert(!is_index_space || DIM > 1); // not implemented for now // make a copy of the input auto dense_input_copy = create_buffer(volume); - if (dense) { + { auto* src = input.ptr(rect.lo); std::copy(src, src + volume, dense_input_copy.ptr(0)); - } else { - auto* target = dense_input_copy.ptr(0); - for (size_t offset = 0; offset < volume; ++offset) { - auto point = pitches.unflatten(offset, rect.lo); - target[offset] = input[rect.lo + point]; - } } // we need a buffer for argsort @@ -109,22 +105,6 @@ struct SortImplBody { AccessorWO output = output_array.write_accessor(rect); std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); } - } else { - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - auto* source = indices_buffer.ptr(0); - for (size_t offset = 0; offset < volume; ++offset) { - auto point = pitches.unflatten(offset, rect.lo); - output[rect.lo + point] = source[offset]; - } - } else { - AccessorWO output = output_array.write_accessor(rect); - auto* source = dense_input_copy.ptr(0); - for (size_t offset = 
0; offset < volume; ++offset) { - auto point = pitches.unflatten(offset, rect.lo); - output[rect.lo + point] = source[offset]; - } - } } } }; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 0fe8b92a4..ca9004f1e 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -80,19 +80,14 @@ struct SortImplBody { #endif const size_t sort_dim_size = global_shape[DIM - 1]; + assert(dense); assert(!is_index_space || DIM > 1); // not implemented for now // make a copy of the input auto dense_input_copy = create_buffer(volume); - if (dense) { + { auto* src = input.ptr(rect.lo); std::copy(src, src + volume, dense_input_copy.ptr(0)); - } else { - auto* target = dense_input_copy.ptr(0); - for (size_t offset = 0; offset < volume; ++offset) { - auto point = pitches.unflatten(offset, rect.lo); - target[offset] = input[rect.lo + point]; - } } // we need a buffer for argsort @@ -103,7 +98,7 @@ struct SortImplBody { dense_input_copy.ptr(0), argsort ? indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); // copy back data (we assume output partition to be aliged to input!) - if (dense) { + { if (argsort) { AccessorWO output = output_array.write_accessor(rect); std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); @@ -111,22 +106,6 @@ struct SortImplBody { AccessorWO output = output_array.write_accessor(rect); std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); } - } else { - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - auto* source = indices_buffer.ptr(0); - for (size_t offset = 0; offset < volume; ++offset) { - auto point = pitches.unflatten(offset, rect.lo); - output[rect.lo + point] = source[offset]; - } - } else { - AccessorWO output = output_array.write_accessor(rect); - auto* source = dense_input_copy.ptr(0); - for (size_t offset = 0; offset < volume; ++offset) { - auto point = pitches.unflatten(offset, rect.lo); - output[rect.lo + point] = source[offset]; - } - } } } }; From 7483a2b4c1fb2682c89d25f638bf8f2e91e6e138 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 7 Mar 2022 02:28:38 -0800 Subject: [PATCH 28/49] transform to complex datatype AFTER computation --- cunumeric/module.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cunumeric/module.py b/cunumeric/module.py index cc73b11b8..08c079479 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5860,13 +5860,12 @@ def sort_complex(a): Single GPU, Single CPU """ - # force complex result - if np.issubdtype(a.dtype, np.complexfloating): - out = a + result = sort(a) + # force complex result upon return + if np.issubdtype(result.dtype, np.complexfloating): + return result else: - out = a.astype(np.complex64, copy=True) - - return sort(out) + return result.astype(np.complex64, copy=True) # Searching From e6beb1d4c3c9245ba70c0c02304419d5af950c25 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 04:58:33 -0800 Subject: [PATCH 29/49] review changes python --- cunumeric/array.py | 6 +-- cunumeric/deferred.py | 19 +++----- cunumeric/eager.py | 2 +- cunumeric/module.py | 18 ++++--- cunumeric/{sorting => }/sorting.py | 39 +++++++-------- cunumeric/sorting/__init__.py | 16 ------ examples/sort.py | 78 +++++++++++++++++++----------- install.py | 2 +- tests/sort.py | 33 +------------ 9 files changed, 89 insertions(+), 124 deletions(-) rename cunumeric/{sorting => }/sorting.py (70%) delete mode 100644 cunumeric/sorting/__init__.py diff 
--git a/cunumeric/array.py b/cunumeric/array.py index 27ed88afb..c9090ed26 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -2771,15 +2771,13 @@ def setflags(self, write=None, align=None, uic=None): """ self.__array__().setflags(write=write, align=align, uic=uic) - def sort(self, axis=-1, kind="stable", order=None): + def sort(self, axis=-1, kind="quicksort", order=None): self._thunk.sort(rhs=self._thunk, axis=axis, kind=kind, order=order) - return - def argsort(self, axis=-1, kind="stable", order=None): + def argsort(self, axis=-1, kind="quicksort", order=None): self._thunk.sort( rhs=self._thunk, argsort=True, axis=axis, kind=kind, order=order ) - return def squeeze(self, axis=None): """a.squeeze(axis=None) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 095a9e1d6..654ebfa70 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -32,7 +32,7 @@ UnaryRedCode, ) from .linalg.cholesky import cholesky -from .sorting.sorting import sorting +from .sorting import sorting from .thunk import NumPyThunk from .utils import get_arg_value_dtype @@ -1544,16 +1544,13 @@ def unique(self): return result @auto_convert([1]) - def sort(self, rhs, argsort=False, axis=-1, kind="stable", order=None): + def sort(self, rhs, argsort=False, axis=-1, kind="quicksort", order=None): + + if kind == "stable": + stable = True + else: + stable = False - if kind != "stable": - self.runtime.warn( - "cuNumeric uses a different (stable) algorithm than " - + str(kind) - + " for sorting", - category=RuntimeWarning, - stacklevel=2, - ) if order is not None: raise NotImplementedError( "cuNumeric does not support sorting with 'order' as " @@ -1562,4 +1559,4 @@ def sort(self, rhs, argsort=False, axis=-1, kind="stable", order=None): if axis is not None and (axis >= rhs.ndim or axis < -rhs.ndim): raise ValueError("invalid axis") - sorting(self, rhs, argsort, axis) + sorting(self, rhs, argsort, axis, stable) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index 30f34e018..59127b985 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -502,7 +502,7 @@ def nonzero(self): result += (EagerArray(self.runtime, array),) return result - def sort(self, rhs, argsort=False, axis=-1, kind="stable", order=None): + def sort(self, rhs, argsort=False, axis=-1, kind="quicksort", order=None): self.check_eager_args(rhs, axis, kind, order) if self.deferred is not None: self.deferred.sort(rhs, argsort, axis, kind, order) diff --git a/cunumeric/module.py b/cunumeric/module.py index 08c079479..3b94291b1 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5712,7 +5712,7 @@ def unique( @add_boilerplate("a") -def argsort(a, axis=-1, kind="stable", order=None): +def argsort(a, axis=-1, kind="quicksort", order=None): """ Returns the indices that would sort an array. @@ -5725,7 +5725,8 @@ def argsort(a, axis=-1, kind="stable", order=None): Axis to sort. By default, the index -1 (the last axis) is used. If None, the flattened array is used. kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional - Currently only 'stable' sort is supported + Default is 'quicksort'. The underlying sort algorithm might vary. + The code basically supports 'stable' or *not* 'stable'. 
order : str or list of str, optional Currently not supported @@ -5746,7 +5747,7 @@ def argsort(a, axis=-1, kind="stable", order=None): Availability -------- - Single GPU, Single CPU + Multiple GPUs, Single CPU """ result = ndarray(a.shape, np.int64) @@ -5782,13 +5783,13 @@ def msort(a): Availability -------- - Single GPU, Single CPU + Multiple GPUs, Single CPU """ return sort(a, axis=0) @add_boilerplate("a") -def sort(a, axis=-1, kind="stable", order=None): +def sort(a, axis=-1, kind="quicksort", order=None): """ Returns a sorted copy of an array. @@ -5801,7 +5802,8 @@ def sort(a, axis=-1, kind="stable", order=None): Axis to sort. By default, the index -1 (the last axis) is used. If None, the flattened array is used. kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional - Currently only 'stable' sort is supported + Default is 'quicksort'. The underlying sort algorithm might vary. + The code basically supports 'stable' or *not* 'stable'. order : str or list of str, optional Currently not supported @@ -5822,7 +5824,7 @@ def sort(a, axis=-1, kind="stable", order=None): Availability -------- - Single GPU, Single CPU + Multiple GPUs, Single CPU """ result = ndarray(a.shape, a.dtype) result._thunk.sort(rhs=a._thunk, axis=axis, kind=kind, order=order) @@ -5857,7 +5859,7 @@ def sort_complex(a): Availability -------- - Single GPU, Single CPU + Multiple GPUs, Single CPU """ result = sort(a) diff --git a/cunumeric/sorting/sorting.py b/cunumeric/sorting.py similarity index 70% rename from cunumeric/sorting/sorting.py rename to cunumeric/sorting.py index b2c72a1a2..f8c56fa0b 100644 --- a/cunumeric/sorting/sorting.py +++ b/cunumeric/sorting.py @@ -1,4 +1,4 @@ -# Copyright 2021-2022 NVIDIA Corporation +# Copyright 2022 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
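The 'stable'/non-'stable' distinction drawn in the docstrings above only becomes observable with duplicate keys: a stable sort preserves the input order of equal elements, which pins down the argsort result. A quick NumPy illustration:

    import numpy as np

    a = np.array([1, 0, 1, 0, 1, 0])
    print(np.argsort(a, kind="stable"))     # [1 3 5 0 2 4]: ties keep input order
    print(np.argsort(a, kind="quicksort"))  # also valid, but tie order unspecified
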
@@ -19,44 +19,40 @@ from legate.core import types as ty -def sort_flattened(output, input, argsort): +def sort_flattened(output, input, argsort, stable): flattened = input.reshape((input.size,), order="C") - flattened_copy = output.runtime.create_empty_thunk( - flattened.shape, dtype=input.dtype, inputs=[input, flattened] - ) - flattened_copy.copy(flattened, deep=True) # run sort flattened -- return 1D solution sort_result = output.runtime.create_empty_thunk( - flattened_copy.shape, dtype=output.dtype, inputs=[flattened_copy] + flattened.shape, dtype=output.dtype, inputs=(flattened,) ) - sorting(sort_result, flattened_copy, argsort) + sorting(sort_result, flattened, argsort, stable=stable) output.base = sort_result.base output.numpy_array = None -def sort_swapped(output, input, argsort, sort_axis): +def sort_swapped(output, input, argsort, sort_axis, stable): assert sort_axis < input.ndim - 1 and sort_axis >= 0 # swap axes swapped = input.swapaxes(sort_axis, input.ndim - 1) swapped_copy = output.runtime.create_empty_thunk( - swapped.shape, dtype=input.dtype, inputs=[input, swapped] + swapped.shape, dtype=input.dtype, inputs=(input, swapped) ) swapped_copy.copy(swapped, deep=True) # run sort on last axis sort_result = output.runtime.create_empty_thunk( - swapped_copy.shape, dtype=output.dtype, inputs=[swapped_copy] + swapped_copy.shape, dtype=output.dtype, inputs=(swapped_copy,) ) - sorting(sort_result, swapped_copy, argsort) + sorting(sort_result, swapped_copy, argsort, stable=stable) output.base = sort_result.swapaxes(input.ndim - 1, sort_axis).base output.numpy_array = None -def sort_task(output, input, argsort): +def sort_task(output, input, argsort, stable): task = output.context.create_task(CuNumericOpCode.SORT) needs_unbound_output = output.runtime.num_gpus > 1 and input.ndim == 1 @@ -80,6 +76,7 @@ def sort_task(output, input, argsort): task.add_scalar_arg(argsort, bool) # return indices flag task.add_scalar_arg(input.base.shape, (ty.int32,)) + task.add_scalar_arg(stable, bool) task.execute() if needs_unbound_output: @@ -87,20 +84,18 @@ def sort_task(output, input, argsort): output.numpy_array = None -def sorting(output, input, argsort, axis=-1): +def sorting(output, input, argsort, axis=-1, stable=False): if axis is None and input.ndim > 1: - sort_flattened(output, input, argsort) + sort_flattened(output, input, argsort, stable) else: if axis is None: - sort_axis = 0 + axis = 0 elif axis < 0: - sort_axis = input.ndim + axis - else: - sort_axis = axis + axis = input.ndim + axis - if sort_axis is not input.ndim - 1: - sort_swapped(output, input, argsort, sort_axis) + if axis is not input.ndim - 1: + sort_swapped(output, input, argsort, axis, stable) else: # run actual sort task - sort_task(output, input, argsort) + sort_task(output, input, argsort, stable) diff --git a/cunumeric/sorting/__init__.py b/cunumeric/sorting/__init__.py deleted file mode 100644 index 8988b3353..000000000 --- a/cunumeric/sorting/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
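The rewritten sorting() entry point normalizes the axis and then dispatches to one of three paths: flatten for axis=None, swap the sort axis to the back when it is not the last one, or hand the last axis straight to the task. A plain NumPy stand-in for that dispatch (hypothetical function, not thunk code):

    import numpy as np

    def sort_dispatch(a, axis=-1):
        if axis is None and a.ndim > 1:
            return np.sort(a.reshape(a.size))              # sort_flattened
        axis = 0 if axis is None else axis % a.ndim        # wrap negative axes
        if axis != a.ndim - 1:
            swapped = a.swapaxes(axis, a.ndim - 1).copy()  # sort_swapped
            return np.sort(swapped).swapaxes(a.ndim - 1, axis)
        return np.sort(a)                                  # direct task path

    x = np.random.default_rng(0).integers(0, 9, (3, 4))
    assert (sort_dispatch(x, 0) == np.sort(x, axis=0)).all()
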
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys as _sys diff --git a/examples/sort.py b/examples/sort.py index 21b503708..47c54f619 100644 --- a/examples/sort.py +++ b/examples/sort.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2021 NVIDIA Corporation +# Copyright 2022 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,64 +16,66 @@ # import argparse -import datetime import numpy from benchmark import run_benchmark +from legate.timing import time import cunumeric -def check_sorted(a, a_numpy, axis=-1): - a_sorted = numpy.sort(a_numpy, axis) +def check_sorted(a, a_sorted, axis=-1): + a_numpy = a.__array__() + a_numpy_sorted = numpy.sort(a_numpy, axis) print("Checking result...") - if cunumeric.allclose(a_sorted, a): + if cunumeric.allclose(a_numpy_sorted, a_sorted): print("PASS!") else: print("FAIL!") - print("NUMPY : " + str(a_sorted)) - print("CUNUMERIC: " + str(a)) + print("NUMPY : " + str(a_numpy_sorted)) + print("CUNUMERIC: " + str(a_sorted)) + assert False -def run_sort(N, shape, axis, datatype, perform_check, timing): +def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): - numpy.random.seed(42) + cunumeric.random.seed(42) newtype = numpy.dtype(datatype).type + if shape is not None: + shape = tuple(shape) + else: + shape = (N,) if numpy.issubdtype(newtype, numpy.integer): - a_numpy = numpy.array( - numpy.random.randint( - numpy.iinfo(newtype).min, numpy.iinfo(newtype).max, size=N - ), - dtype=newtype, + if lower is None: + lower = numpy.iinfo(newtype).min + if upper is None: + upper = numpy.iinfo(newtype).max + a = cunumeric.random.randint(low=lower, high=upper, size=N).astype( + newtype ) + a = a.reshape(shape) elif numpy.issubdtype(newtype, numpy.floating): - a_numpy = numpy.array(numpy.random.random(size=N), dtype=newtype) + a = cunumeric.random.random(shape).astype(newtype) elif numpy.issubdtype(newtype, numpy.complexfloating): - a_numpy = numpy.array( - numpy.random.random(size=N) + numpy.random.random(size=N) * 1j, - dtype=newtype, - ) + a = cunumeric.array( + cunumeric.random.random(shape) + + cunumeric.random.random(shape) * 1j + ).astype(newtype) else: print("UNKNOWN type " + str(newtype)) assert False - if shape is not None: - a_numpy = a_numpy.reshape(tuple(shape)) - - a = cunumeric.array(a_numpy) - - start = datetime.datetime.now() + start = time() a_sorted = cunumeric.sort(a, axis) - stop = datetime.datetime.now() + stop = time() if perform_check: - check_sorted(a_sorted, a_numpy, axis) + check_sorted(a, a_sorted, axis) else: # do we need to synchronize? 
assert True - delta = stop - start - total = delta.total_seconds() * 1000.0 + total = (stop - start) * 1e-3 if timing: print("Elapsed Time: " + str(total) + " ms") return total @@ -120,6 +122,22 @@ def run_sort(N, shape, axis, datatype, perform_check, timing): dest="datatype", help="data type (default numpy.int32)", ) + parser.add_argument( + "-l", + "--lower", + type=int, + default=None, + dest="lower", + help="lower bound for integer based arrays (inclusive)", + ) + parser.add_argument( + "-u", + "--upper", + type=int, + default=None, + dest="upper", + help="upper bound for integer based arrays (exclusive)", + ) parser.add_argument( "-a", "--axis", @@ -148,6 +166,8 @@ def run_sort(N, shape, axis, datatype, perform_check, timing): args.shape, args.axis, args.datatype, + args.lower, + args.upper, args.check, args.timing, ), diff --git a/install.py b/install.py index 45a9281f6..ad3581810 100755 --- a/install.py +++ b/install.py @@ -160,7 +160,7 @@ def install_openblas(openblas_dir, thread_count, verbose): git_clone( temp_dir, url="https://github.com/xianyi/OpenBLAS.git", - tag="v0.3.19", + tag="v0.3.15", verbose=verbose, ) # We can just build this directly diff --git a/tests/sort.py b/tests/sort.py index 9cac07195..50b705364 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -1,4 +1,4 @@ -# Copyright 2021 NVIDIA Corporation +# Copyright 2022 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -56,8 +56,6 @@ def test_1D(): print("Result (inplace): " + str(A_num)) compare_assert(sortA_np, A_num) - return - def test_2D(): np.random.seed(42) @@ -75,8 +73,6 @@ def test_2D(): test_sort_axis(A_np, A_num, 0) test_sort_axis(A_np, A_num, axis=None) - return - def test_3D(x_dim, y_dim, z_dim): np.random.seed(42) @@ -93,8 +89,6 @@ def test_3D(x_dim, y_dim, z_dim): test_sort_axis(A_np, A_num, 0) test_sort_axis(A_np, A_num, axis=None) - return - def test_3D_complex(x_dim, y_dim, z_dim): np.random.seed(42) @@ -111,20 +105,6 @@ def test_3D_complex(x_dim, y_dim, z_dim): test_sort_axis(A_np, A_num, 0) test_sort_axis(A_np, A_num, axis=None) - return - - -def test_custom(): - np.random.seed(42) - a = generate_random((4,), np.uint8) - print("Matrix A") - print(a) - - a_num = num.array(a) - compare_assert(np.sort_complex(a), num.sort_complex(a_num)) - - return - def test_api(a=None): if a is None: @@ -150,9 +130,6 @@ def test_api(a=None): print("sort_complex") compare_assert(np.sort_complex(a), num.sort_complex(a_num)) - # reverse order sort - # TODO - # in-place sort copy_a = a.copy() copy_a_num = a_num.copy() @@ -160,9 +137,6 @@ def test_api(a=None): copy_a_num.sort() compare_assert(copy_a, copy_a_num) - # reverse order sort (in place) - # TODO - # argsort for i in range(a.ndim): compare_assert(a, a_num) @@ -177,8 +151,6 @@ def test_api(a=None): np.argsort(a, axis=None, kind="stable"), num.argsort(a_num, axis=None) ) - return - def generate_random(shape, datatype): print("Generate random for " + str(datatype)) @@ -228,8 +200,6 @@ def test_dtypes(): test_api(generate_random((2, 5, 7), np.complex128)) test_api(generate_random((220,), np.complex128)) - return - def test(): print("\n\n ----------- 1D test ---------------\n") @@ -248,4 +218,3 @@ def test(): if __name__ == "__main__": test() - # test_custom() From c7fee9902c98b7d60e29952bc9a97d54c70ceb13 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 09:40:48 -0800 Subject: [PATCH 30/49] review changes C++ signatures and cleanup --- 
src/cunumeric/sort/sort.cc | 39 +++------ src/cunumeric/sort/sort.cu | 120 ++++++++++----------------- src/cunumeric/sort/sort.h | 26 ++---- src/cunumeric/sort/sort_omp.cc | 38 +++------ src/cunumeric/sort/sort_template.inl | 41 ++++----- 5 files changed, 93 insertions(+), 171 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index ca0bf1545..740622b68 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,27 +59,16 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, const bool argsort, - const Legion::DomainPoint global_shape, + const bool stable, const bool is_index_space, - const Legion::DomainPoint index_point, - const Legion::Domain domain, + const size_t local_rank, + const size_t num_ranks, const std::vector& comms) { - AccessorRO input = input_array.read_accessor(rect); - - bool dense = input.accessor.is_dense_row_major(rect); - -#ifdef DEBUG_CUNUMERIC - std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume - << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << ", argsort. = " << argsort << std::endl; -#endif - - const size_t sort_dim_size = global_shape[DIM - 1]; - - assert(dense); + auto input = input_array.read_accessor(rect); + assert(input.accessor.is_dense_row_major(rect)); assert(!is_index_space || DIM > 1); // not implemented for now // make a copy of the input @@ -97,14 +86,12 @@ struct SortImplBody { dense_input_copy.ptr(0), argsort ? indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); // copy back data (we assume output partition to be aliged to input!) - if (dense) { - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); - } else { - AccessorWO output = output_array.write_accessor(rect); - std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); - } + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); + } else { + AccessorWO output = output_array.write_accessor(rect); + std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); } } }; diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 6a25249e9..b63d61f8e 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
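With the slimmed-down signature, the CPU body reduces to a segmented local sort: the dense buffer holds volume / sort_dim_size independent rows and each row is sorted on its own. A NumPy sketch of that inner loop (hypothetical helper name):

    import numpy as np

    def segmented_sort(flat, sort_dim_size):
        # one independent sort per row of the flattened buffer
        out = flat.copy()
        for start in range(0, len(out), sort_dim_size):
            out[start:start + sort_dim_size].sort(kind="stable")
        return out

    x = np.array([3, 1, 2, 9, 7, 8], dtype=np.int32)
    print(segmented_sort(x, 3))  # [1 2 3 7 8 9]
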
@@ -76,13 +76,14 @@ void cub_local_sort_inplace( { // make a copy of input --> we want inptr to return sorted values auto keys_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - cudaMemcpyAsync(keys_in.ptr(0), inptr, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream); + CHECK_CUDA( + cudaMemcpyAsync(keys_in.ptr(0), inptr, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); size_t temp_storage_bytes = 0; if (argptr == nullptr) { if (volume == sort_dim_size) { - // sort + // sort (initial call to compute bufffer size) cub::DeviceRadixSort::SortKeys( - NULL, temp_storage_bytes, keys_in.ptr(0), inptr, volume, 0, sizeof(VAL) * 8, stream); + nullptr, temp_storage_bytes, keys_in.ptr(0), inptr, volume, 0, sizeof(VAL) * 8, stream); auto temp_storage = create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); cub::DeviceRadixSort::SortKeys(temp_storage.ptr(0), @@ -94,13 +95,13 @@ void cub_local_sort_inplace( sizeof(VAL) * 8, stream); } else { - // segmented sort + // segmented sort (initial call to compute bufffer size) auto off_start_it = thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); auto off_end_it = thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); - cub::DeviceSegmentedRadixSort::SortKeys(NULL, + cub::DeviceSegmentedRadixSort::SortKeys(nullptr, temp_storage_bytes, keys_in.ptr(0), inptr, @@ -128,12 +129,12 @@ void cub_local_sort_inplace( } } else { auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - cudaMemcpyAsync( - idx_in.ptr(0), argptr, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream); + CHECK_CUDA(cudaMemcpyAsync( + idx_in.ptr(0), argptr, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream)); if (volume == sort_dim_size) { - // argsort - cub::DeviceRadixSort::SortPairs(NULL, + // argsort (initial call to compute bufffer size) + cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, keys_in.ptr(0), inptr, @@ -158,13 +159,13 @@ void cub_local_sort_inplace( sizeof(VAL) * 8, stream); } else { - // segmented argsort + // segmented argsort (initial call to compute bufffer size) auto off_start_it = thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); auto off_end_it = thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); - cub::DeviceSegmentedRadixSort::SortPairs(NULL, + cub::DeviceSegmentedRadixSort::SortPairs(nullptr, temp_storage_bytes, keys_in.ptr(0), inptr, @@ -276,19 +277,6 @@ void local_sort_inplace(legate_type_of* inptr, if (volume > 0) { thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } } -template -__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) - print_subset(const VAL* data, const size_t volume, const size_t rank) -{ - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx == 0) { - printf("data(%d) = [ ", rank); - for (int i = 0; i < volume; ++i) { printf("%d ", data[i]); } - printf("]\n"); - } -} - // auto align to multiples of 16 bytes auto get_aligned_size = [](auto size) { return std::max(16, (size + 15) / 16 * 16); }; @@ -341,7 +329,6 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) samples[offset + sample_idx].value = data[index]; samples[offset + sample_idx].rank = rank; samples[offset + sample_idx].position = index; - // printf("Sample rank %lu position %lu offset %lu\n", rank, index, (offset+sample_idx)); } else { // edge case where 
num_local_samples > volume if (sample_idx < volume) { @@ -381,8 +368,6 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) } else { split_positions[splitter_idx] = splitter.position + 1; } - // printf("Splitter position id %lu rank %lu position %lu num_samples %lu\n", splitter_idx, rank, - // split_positions[splitter_idx], num_samples); } template @@ -473,7 +458,7 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, // all2all exchange send/receive sizes auto size_recv = create_buffer(num_ranks, Memory::Z_COPY_MEM); CHECK_NCCL(ncclGroupStart()); - for (int r = 0; r < num_ranks; r++) { + for (size_t r = 0; r < num_ranks; r++) { CHECK_NCCL(ncclSend(size_send.ptr(r), 1, ncclUint64, r, *comm, stream)); CHECK_NCCL(ncclRecv(size_recv.ptr(r), 1, ncclUint64, r, *comm, stream)); } @@ -485,7 +470,7 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, // allocate merge targets, data transfer... std::vector> merge_buffers(num_ranks); - for (int i = 0; i < merge_buffers.size(); ++i) { + for (size_t i = 0; i < num_ranks; ++i) { // align buffer to allow data transfer of 16byte blocks auto recv_size_aligned = get_aligned_size(size_recv[i] * sizeof(VAL)); auto buf_size = (recv_size_aligned + sizeof(VAL) - 1) / sizeof(VAL); @@ -495,7 +480,7 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, } size_t send_pos = 0; CHECK_NCCL(ncclGroupStart()); - for (int r = 0; r < num_ranks; r++) { + for (size_t r = 0; r < num_ranks; r++) { CHECK_NCCL(ncclSend(local_sorted.values.ptr(send_pos), get_aligned_size(size_send[r] * sizeof(VAL)), ncclInt8, @@ -545,7 +530,6 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, p_indices2, p_merged_values, p_merged_indices); - CHECK_CUDA(cudaStreamSynchronize(stream)); source1.indices.destroy(); } else { thrust::merge(thrust::cuda::par.on(stream), @@ -554,7 +538,6 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, p_values2, p_values2 + source2.size, p_merged_values); - CHECK_CUDA(cudaStreamSynchronize(stream)); } source1.values.destroy(); @@ -578,37 +561,28 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, const bool argsort, - const Legion::DomainPoint global_shape, + const bool stable, const bool is_index_space, - const Legion::DomainPoint index_point, - const Legion::Domain domain, + const size_t local_rank, + const size_t num_ranks, const std::vector& comms) { - AccessorRO input = input_array.read_accessor(rect); - - size_t my_rank = getRank(domain, index_point); - -#ifdef DEBUG_CUNUMERIC - std::cout << "GPU(" << my_rank << "): local size = " << volume << ", dist. = " << is_index_space - << ", index_point = " << index_point << ", domain/volume = " << domain << "/" - << domain.get_volume() << ", dense = " << input.accessor.is_dense_row_major(rect) - << ", argsort. = " << argsort << std::endl; -#endif + auto input = input_array.read_accessor(rect); + // we allow empty domains for distributed sorting assert(rect.empty() || input.accessor.is_dense_row_major(rect)); auto stream = get_cached_stream(); - const size_t sort_dim_size = DIM == 1 ? 
volume : global_shape[DIM - 1]; - // make a copy of the input auto dense_input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - cudaMemcpyAsync(dense_input_copy.ptr(0), - input.ptr(rect.lo), - sizeof(VAL) * volume, - cudaMemcpyDeviceToDevice, - stream); + CHECK_CUDA(cudaMemcpyAsync(dense_input_copy.ptr(0), + input.ptr(rect.lo), + sizeof(VAL) * volume, + cudaMemcpyDeviceToDevice, + stream)); // we need a buffer for argsort auto indices_buffer = @@ -641,17 +615,14 @@ struct SortImplBody { // this is linked to the decision in sorting.py on when to use adn 'unbounded' output array. if (output_array.dim() == -1) { SortPiece local_sorted; - local_sorted.values = dense_input_copy; - local_sorted.indices = indices_buffer; - local_sorted.size = volume; - SortPiece local_sorted_repartitioned = is_index_space - ? sample_sort_nccl(local_sorted, - my_rank, - domain.get_volume(), - argsort, - stream, - comms[0].get()) - : local_sorted; + local_sorted.values = dense_input_copy; + local_sorted.indices = indices_buffer; + local_sorted.size = volume; + SortPiece local_sorted_repartitioned = + is_index_space + ? sample_sort_nccl( + local_sorted, local_rank, num_ranks, argsort, stream, comms[0].get()) + : local_sorted; if (argsort) { output_array.return_data(local_sorted_repartitioned.indices, local_sorted_repartitioned.size); @@ -664,22 +635,21 @@ struct SortImplBody { if (argsort) { AccessorWO output = output_array.write_accessor(rect); assert(output.accessor.is_dense_row_major(rect)); - cudaMemcpyAsync(output.ptr(rect.lo), - indices_buffer.ptr(0), - sizeof(int64_t) * volume, - cudaMemcpyDeviceToDevice, - stream); + CHECK_CUDA(cudaMemcpyAsync(output.ptr(rect.lo), + indices_buffer.ptr(0), + sizeof(int64_t) * volume, + cudaMemcpyDeviceToDevice, + stream)); } else { AccessorWO output = output_array.write_accessor(rect); assert(output.accessor.is_dense_row_major(rect)); - cudaMemcpyAsync(output.ptr(rect.lo), - dense_input_copy.ptr(0), - sizeof(VAL) * volume, - cudaMemcpyDeviceToDevice, - stream); + CHECK_CUDA(cudaMemcpyAsync(output.ptr(rect.lo), + dense_input_copy.ptr(0), + sizeof(VAL) * volume, + cudaMemcpyDeviceToDevice, + stream)); } } - CHECK_CUDA(cudaStreamSynchronize(stream)); } }; diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index b915df838..b6dc88b5d 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
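The branch above initializes the argsort payload differently per case: a 1-D (possibly distributed) sort needs globally valid indices starting at the tile offset rect.lo[0], while an N-D sort needs 0..sort_dim_size-1 repeated for every row, computed with a modulus. A NumPy equivalent (hypothetical helper name):

    import numpy as np

    def init_indices(volume, sort_dim_size, tile_offset, one_dim):
        if one_dim:
            # global positions, like thrust::sequence with an offset
            return np.arange(tile_offset, tile_offset + volume, dtype=np.int64)
        # per-row positions, like the thrust::transform with modulus
        return np.arange(volume, dtype=np.int64) % sort_dim_size

    print(init_indices(6, 3, 0, one_dim=False))  # [0 1 2 0 1 2]
    print(init_indices(4, 4, 10, one_dim=True))  # [10 11 12 13]
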
@@ -24,10 +24,11 @@ struct SortArgs { const Array& input; Array& output; bool argsort; - Legion::DomainPoint global_shape; + bool stable; + size_t sort_dim_size; bool is_index_space; - Legion::DomainPoint task_index; - Legion::Domain launch_domain; + size_t local_rank; + size_t num_ranks; }; template @@ -37,23 +38,6 @@ struct SampleEntry { size_t local_id; }; -template -struct SampleEntryComparator { - bool operator()(const SampleEntry& a, const SampleEntry& b) const - { - if (a.value < b.value) { - return true; - } else if (a.value == b.value) { - if (a.rank < b.rank) { - return true; - } else if (a.rank == b.rank) { - return a.local_id < b.local_id; - } - } - return false; - } -}; - class SortTask : public CuNumericTask { public: static const int TASK_ID = CUNUMERIC_SORT; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index ca9004f1e..54ef40109 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,26 +61,16 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, const bool argsort, - const Legion::DomainPoint global_shape, + const bool stable, const bool is_index_space, - const Legion::DomainPoint index_point, - const Legion::Domain domain, + const size_t local_rank, + const size_t num_ranks, const std::vector& comms) { - AccessorRO input = input_array.read_accessor(rect); - - bool dense = input.accessor.is_dense_row_major(rect); - -#ifdef DEBUG_CUNUMERIC - std::cout << "OMP(" << getRank(domain, index_point) << "): local size = " << volume - << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << ", argsort. = " << argsort << std::endl; -#endif - - const size_t sort_dim_size = global_shape[DIM - 1]; - assert(dense); + auto input = input_array.read_accessor(rect); + assert(input.accessor.is_dense_row_major(rect)); assert(!is_index_space || DIM > 1); // not implemented for now // make a copy of the input @@ -98,14 +88,12 @@ struct SortImplBody { dense_input_copy.ptr(0), argsort ? indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); // copy back data (we assume output partition to be aliged to input!) 
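The SampleEntryComparator removed from the header above encoded nothing more than lexicographic order on (value, rank, local_id); the device-side SampleComparator adds one twist, pushing unused samples (rank < 0) behind all used ones. A hedged Python rendering of that ordering (not library code):

    def sample_key(sample):
        value, rank, position = sample
        # leading flag sends rank < 0 (unused) entries to the back
        return (rank < 0, value, rank, position)

    samples = [(7, 1, 0), (3, -1, 0), (7, 0, 5), (2, 2, 1)]
    print(sorted(samples, key=sample_key))
    # [(2, 2, 1), (7, 0, 5), (7, 1, 0), (3, -1, 0)]
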
- { - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); - } else { - AccessorWO output = output_array.write_accessor(rect); - std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); - } + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); + } else { + AccessorWO output = output_array.write_accessor(rect); + std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); } } }; diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 2360f1068..593b7cc21 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ using namespace legate; template struct SortImplBody; -static int getRank(Domain domain, DomainPoint index_point) +static int get_rank(Domain domain, DomainPoint index_point) { int domain_index = 0; for (int i = 0; i < domain.get_dim(); ++i) { @@ -46,6 +46,8 @@ struct SortImpl { Pitches pitches; size_t volume = pitches.flatten(rect); + size_t sort_dim_size = std::min(args.sort_dim_size, volume); + /* * Assumptions: * 1. Sort is always requested for the 'last' dimension within rect @@ -56,15 +58,6 @@ struct SortImpl { * */ -#ifdef DEBUG_CUNUMERIC - std::cout << "DIM=" << DIM << ", rect=" << rect << ", shape=" << args.global_shape - << ", argsort=" << args.argsort << ", sort_dim_size=" << args.global_shape[DIM - 1] - << std::endl; - - assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.global_shape[DIM - 1])) && - "multi-dimensional array should not be distributed in (sort) dimension"); -#endif - // we shall not return on empty rectangle in case of distributed data // as the process might still participate in the parallel sort if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; @@ -74,11 +67,12 @@ struct SortImpl { pitches, rect, volume, + sort_dim_size, args.argsort, - args.global_shape, + args.stable, args.is_index_space, - args.task_index, - args.launch_domain, + args.local_rank, + args.num_ranks, comms); } }; @@ -86,20 +80,19 @@ struct SortImpl { template static void sort_template(TaskContext& context) { - DomainPoint global_shape; - { - auto shape_span = context.scalars()[1].values(); - global_shape.dim = shape_span.size(); - for (int32_t dim = 0; dim < global_shape.dim; ++dim) { global_shape[dim] = shape_span[dim]; } - } + auto shape_span = context.scalars()[1].values(); + size_t sort_dim_size = shape_span[shape_span.size() - 1]; + size_t local_rank = get_rank(context.get_launch_domain(), context.get_task_index()); + size_t num_ranks = context.get_launch_domain().get_volume(); SortArgs args{context.inputs()[0], context.outputs()[0], - context.scalars()[0].value(), - global_shape, + context.scalars()[0].value(), // argsort + context.scalars()[2].value(), // stable + sort_dim_size, !context.is_single_task(), - context.get_task_index(), - context.get_launch_domain()}; + local_rank, + num_ranks}; double_dispatch( args.input.dim(), args.input.code(), SortImpl{}, args, context.communicators()); } From 5a0204796908cbd320a438c745a89d3de2ba2d24 Mon Sep 17 00:00:00 
2001 From: Malte Foerster Date: Wed, 9 Mar 2022 09:45:53 -0800 Subject: [PATCH 31/49] non-stable sort for primitive values --- src/cunumeric/sort/sort.cc | 2 +- src/cunumeric/sort/sort.cu | 10 +++++----- src/cunumeric/sort/sort_omp.cc | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 740622b68..6f40927dc 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -40,7 +40,7 @@ struct SortImplBody { if (argptr == nullptr) { // sort (in place) for (size_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - thrust::stable_sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size); + thrust::sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size); } } else { // argsort diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index b63d61f8e..6d61c9226 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -205,7 +205,7 @@ void thrust_local_sort_inplace( { if (argptr == nullptr) { if (volume == sort_dim_size) { - thrust::stable_sort(thrust::cuda::par.on(stream), inptr, inptr + volume); + thrust::sort(thrust::cuda::par.on(stream), inptr, inptr + volume); } else { auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); // init combined keys @@ -217,10 +217,10 @@ void thrust_local_sort_inplace( thrust::divides()); auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), inptr)); - thrust::stable_sort(thrust::cuda::par.on(stream), - combined, - combined + volume, - thrust::less>()); + thrust::sort(thrust::cuda::par.on(stream), + combined, + combined + volume, + thrust::less>()); } } else { if (volume == sort_dim_size) { diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 54ef40109..b97c66a8a 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -41,7 +41,7 @@ struct SortImplBody { // sort (in place) #pragma omp parallel for for (size_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - thrust::stable_sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size); + thrust::sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size); } } else { // argsort From 0c1805f22a0599171efe432a31f1a8ba19e517c4 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 11:48:30 -0800 Subject: [PATCH 32/49] remove copies where possible --- src/cunumeric/sort/sort.cc | 34 ++++++------ src/cunumeric/sort/sort.cu | 99 +++++++++++++++++----------------- src/cunumeric/sort/sort_omp.cc | 34 ++++++------ 3 files changed, 85 insertions(+), 82 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 6f40927dc..f20169284 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -71,27 +71,29 @@ struct SortImplBody { assert(input.accessor.is_dense_row_major(rect)); assert(!is_index_space || DIM > 1); // not implemented for now - // make a copy of the input - auto dense_input_copy = create_buffer(volume); - { - auto* src = input.ptr(rect.lo); - std::copy(src, src + volume, dense_input_copy.ptr(0)); - } + if (argsort) { + // make copy of the input + auto dense_input_copy = create_buffer(volume); + { + auto* src = input.ptr(rect.lo); + std::copy(src, src + volume, dense_input_copy.ptr(0)); + } - // we need a buffer for argsort - auto indices_buffer = create_buffer(argsort ? 
volume : 0); + AccessorWO output = output_array.write_accessor(rect); - // sort data - thrust_local_sort_inplace( - dense_input_copy.ptr(0), argsort ? indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); + // sort data in place + thrust_local_sort_inplace( + dense_input_copy.ptr(0), output.ptr(rect.lo), volume, sort_dim_size); - // copy back data (we assume output partition to be aliged to input!) - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); } else { AccessorWO output = output_array.write_accessor(rect); - std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); + + // init output values + auto* src = input.ptr(rect.lo); + std::copy(src, src + volume, output.ptr(rect.lo)); + + // sort data in place + thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size); } } }; diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 6d61c9226..1cd60aceb 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -263,7 +263,7 @@ void local_sort_inplace(legate_type_of* inptr, cudaStream_t stream) { using VAL = legate_type_of; - if (volume > 0) { cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } + cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } template ::value>* = nullptr> @@ -274,7 +274,7 @@ void local_sort_inplace(legate_type_of* inptr, cudaStream_t stream) { using VAL = legate_type_of; - if (volume > 0) { thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } + thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } // auto align to multiples of 16 bytes @@ -576,48 +576,66 @@ struct SortImplBody { auto stream = get_cached_stream(); - // make a copy of the input - auto dense_input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - CHECK_CUDA(cudaMemcpyAsync(dense_input_copy.ptr(0), - input.ptr(rect.lo), - sizeof(VAL) * volume, - cudaMemcpyDeviceToDevice, - stream)); - - // we need a buffer for argsort - auto indices_buffer = - create_buffer(argsort ? volume : 0, Legion::Memory::Kind::GPU_FB_MEM); - if (argsort && volume > 0) { - // intialize + // initialize sort pointers + SortPiece local_sorted; + int64_t* indices_ptr = nullptr; + VAL* values_ptr = nullptr; + if (argsort) { + // make a buffer for input + auto input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + values_ptr = input_copy.ptr(0); + + // initialize indices + if (output_array.dim() == -1) { + auto indices_buffer = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + indices_ptr = indices_buffer.ptr(0); + local_sorted.values = input_copy; + local_sorted.indices = indices_buffer; + local_sorted.size = volume; + } else { + AccessorWO output = output_array.write_accessor(rect); + assert(output.accessor.is_dense_row_major(rect)); + indices_ptr = output.ptr(rect.lo); + } if (DIM == 1) { size_t offset = DIM > 1 ? 
0 : rect.lo[0]; - thrust::sequence(thrust::cuda::par.on(stream), - indices_buffer.ptr(0), - indices_buffer.ptr(0) + volume, - offset); + if (volume > 0) { + thrust::sequence(thrust::cuda::par.on(stream), indices_ptr, indices_ptr + volume, offset); + } } else { thrust::transform(thrust::cuda::par.on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(volume), thrust::make_constant_iterator(sort_dim_size), - indices_buffer.ptr(0), + indices_ptr, thrust::modulus()); } + } else { + // initialize output + if (output_array.dim() == -1) { + auto input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + values_ptr = input_copy.ptr(0); + local_sorted.values = input_copy; + local_sorted.indices = create_buffer(0, Legion::Memory::Kind::GPU_FB_MEM); + ; + local_sorted.size = volume; + } else { + AccessorWO output = output_array.write_accessor(rect); + assert(output.accessor.is_dense_row_major(rect)); + values_ptr = output.ptr(rect.lo); + } } - // sort data - local_sort_inplace(dense_input_copy.ptr(0), - argsort ? indices_buffer.ptr(0) : nullptr, - volume, - sort_dim_size, - stream); + if (volume > 0) { + CHECK_CUDA(cudaMemcpyAsync( + values_ptr, input.ptr(rect.lo), sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); - // this is linked to the decision in sorting.py on when to use adn 'unbounded' output array. + // sort data (locally) + local_sort_inplace(values_ptr, indices_ptr, volume, sort_dim_size, stream); + } + + // this is linked to the decision in sorting.py on when to use an 'unbounded' output array. if (output_array.dim() == -1) { - SortPiece local_sorted; - local_sorted.values = dense_input_copy; - local_sorted.indices = indices_buffer; - local_sorted.size = volume; SortPiece local_sorted_repartitioned = is_index_space ? sample_sort_nccl( @@ -630,25 +648,6 @@ struct SortImplBody { output_array.return_data(local_sorted_repartitioned.values, local_sorted_repartitioned.size); } - } else { - // copy back data (we assume output partition to be aliged to input!) - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - assert(output.accessor.is_dense_row_major(rect)); - CHECK_CUDA(cudaMemcpyAsync(output.ptr(rect.lo), - indices_buffer.ptr(0), - sizeof(int64_t) * volume, - cudaMemcpyDeviceToDevice, - stream)); - } else { - AccessorWO output = output_array.write_accessor(rect); - assert(output.accessor.is_dense_row_major(rect)); - CHECK_CUDA(cudaMemcpyAsync(output.ptr(rect.lo), - dense_input_copy.ptr(0), - sizeof(VAL) * volume, - cudaMemcpyDeviceToDevice, - stream)); - } } } }; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index b97c66a8a..192a75333 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -73,27 +73,29 @@ struct SortImplBody { assert(input.accessor.is_dense_row_major(rect)); assert(!is_index_space || DIM > 1); // not implemented for now - // make a copy of the input - auto dense_input_copy = create_buffer(volume); - { - auto* src = input.ptr(rect.lo); - std::copy(src, src + volume, dense_input_copy.ptr(0)); - } + if (argsort) { + // make copy of the input + auto dense_input_copy = create_buffer(volume); + { + auto* src = input.ptr(rect.lo); + std::copy(src, src + volume, dense_input_copy.ptr(0)); + } - // we need a buffer for argsort - auto indices_buffer = create_buffer(argsort ? volume : 0); + AccessorWO output = output_array.write_accessor(rect); - // sort data - thrust_local_sort_inplace( - dense_input_copy.ptr(0), argsort ? 
indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); + // sort data in place + thrust_local_sort_inplace( + dense_input_copy.ptr(0), output.ptr(rect.lo), volume, sort_dim_size); - // copy back data (we assume output partition to be aliged to input!) - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); } else { AccessorWO output = output_array.write_accessor(rect); - std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); + + // init output values + auto* src = input.ptr(rect.lo); + std::copy(src, src + volume, output.ptr(rect.lo)); + + // sort data in place + thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size); } } }; From 461ae2b12857ff68d2c9a549b6bfdb87a89b5404 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 12:08:19 -0800 Subject: [PATCH 33/49] fix eager test with new default non-stable sort --- tests/sort.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/sort.py b/tests/sort.py index 50b705364..4937de7ec 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -114,12 +114,16 @@ def test_api(a=None): # sort axes for i in range(a.ndim): print("sort axis " + str(i)) - compare_assert(np.sort(a, axis=i, kind="stable"), num.sort(a_num, i)) + compare_assert( + np.sort(a, axis=i, kind="stable"), + num.sort(a_num, i, kind="stable"), + ) # flatten print("sort flattened") compare_assert( - np.sort(a, axis=None, kind="stable"), num.sort(a_num, axis=None) + np.sort(a, axis=None, kind="stable"), + num.sort(a_num, axis=None, kind="stable"), ) # msort @@ -142,13 +146,15 @@ def test_api(a=None): compare_assert(a, a_num) print("argsort axis " + str(i)) compare_assert( - np.argsort(a, axis=i, kind="stable"), num.argsort(a_num, axis=i) + np.argsort(a, axis=i, kind="stable"), + num.argsort(a_num, axis=i, kind="stable"), ) # flatten print("argsort flattened") compare_assert( - np.argsort(a, axis=None, kind="stable"), num.argsort(a_num, axis=None) + np.argsort(a, axis=None, kind="stable"), + num.argsort(a_num, axis=None, kind="stable"), ) From 763b99c5cbf9f87169a442e40f54f7ad2e73b2f1 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 12:08:56 -0800 Subject: [PATCH 34/49] fix naming conventions --- examples/sort.py | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/examples/sort.py b/examples/sort.py index 47c54f619..6713da232 100644 --- a/examples/sort.py +++ b/examples/sort.py @@ -17,57 +17,54 @@ import argparse -import numpy +import numpy as np from benchmark import run_benchmark from legate.timing import time -import cunumeric +import cunumeric as num def check_sorted(a, a_sorted, axis=-1): - a_numpy = a.__array__() - a_numpy_sorted = numpy.sort(a_numpy, axis) + a_np = a.__array__() + a_np_sorted = np.sort(a_np, axis) print("Checking result...") - if cunumeric.allclose(a_numpy_sorted, a_sorted): + if num.allclose(a_np_sorted, a_sorted): print("PASS!") else: print("FAIL!") - print("NUMPY : " + str(a_numpy_sorted)) + print("NUMPY : " + str(a_np_sorted)) print("CUNUMERIC: " + str(a_sorted)) assert False def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): - cunumeric.random.seed(42) - newtype = numpy.dtype(datatype).type + num.random.seed(42) + newtype = np.dtype(datatype).type if shape is not None: shape = tuple(shape) else: shape = (N,) - if numpy.issubdtype(newtype, 
numpy.integer): if lower is None: - lower = numpy.iinfo(newtype).min + lower = np.iinfo(newtype).min if upper is None: - upper = numpy.iinfo(newtype).max - a = cunumeric.random.randint(low=lower, high=upper, size=N).astype( - newtype - ) + upper = np.iinfo(newtype).max + a = num.random.randint(low=lower, high=upper, size=N).astype(newtype) a = a.reshape(shape) - elif numpy.issubdtype(newtype, numpy.floating): - a = cunumeric.random.random(shape).astype(newtype) - elif numpy.issubdtype(newtype, numpy.complexfloating): - a = cunumeric.array( - cunumeric.random.random(shape) - + cunumeric.random.random(shape) * 1j + elif np.issubdtype(newtype, np.floating): + a = num.random.random(shape).astype(newtype) + elif np.issubdtype(newtype, np.complexfloating): + a = num.array( + num.random.random(shape) + num.random.random(shape) * 1j ).astype(newtype) else: print("UNKNOWN type " + str(newtype)) assert False start = time() - a_sorted = cunumeric.sort(a, axis) + a_sorted = num.sort(a, axis) stop = time() if perform_check: @@ -120,7 +117,7 @@ def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): type=str, default="uint32", dest="datatype", - help="data type (default numpy.int32)", + help="data type (default np.int32)", ) parser.add_argument( "-l", From b210b69e6f84e6242854f0a8a96a316156a12ea2 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 12:15:09 -0800 Subject: [PATCH 35/49] minor adjustments, comments --- src/cunumeric/sort/sort.cc | 2 +- src/cunumeric/sort/sort.cu | 9 ++++----- src/cunumeric/sort/sort_omp.cc | 2 +- src/cunumeric/sort/sort_template.inl | 14 ++++++++++++-- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index f20169284..137c4ea21 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -69,7 +69,7 @@ struct SortImplBody { { auto input = input_array.read_accessor(rect); assert(input.accessor.is_dense_row_major(rect)); - assert(!is_index_space || DIM > 1); // not implemented for now + assert(!is_index_space || DIM > 1); if (argsort) { // make copy of the input diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 1cd60aceb..8eb18762d 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -380,8 +380,10 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, { size_t volume = local_sorted.size; - // collect local samples - size_t num_local_samples = num_ranks; // handle case numRanks > volume!!
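The sample collection reworked here follows classic sample sort: every rank contributes p = num_ranks samples, the p * p gathered samples are sorted, and every p-th one becomes a split point, so each rank ends up with roughly volume / num_ranks elements (worst case about twice that, as the new comment notes). A hedged host-side sketch of the splitter selection with illustrative names — the actual code additionally keys samples on rank and local position to break ties:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // samples holds p * p values: p local samples gathered from each of p ranks.
    // Returns the p - 1 splitters that partition the value range across ranks.
    std::vector<float> pick_splitters(std::vector<float> samples, std::size_t p)
    {
      std::sort(samples.begin(), samples.end());
      std::vector<float> splitters(p - 1);
      for (std::size_t r = 1; r < p; ++r) { splitters[r - 1] = samples[r * p - 1]; }
      return splitters;
    }
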
+ // collect local samples - for now we take num_ranks samples for every node + // worst case this leads to 2*N/ranks elements on a single node + size_t num_local_samples = num_ranks; + size_t num_global_samples = num_local_samples * num_ranks; auto samples = create_buffer>(num_global_samples, Memory::GPU_FB_MEM); @@ -452,9 +454,6 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, size_send[num_ranks - 1] = volume - last_position; } - // need to sync as we share values in between host/device - CHECK_CUDA(cudaStreamSynchronize(stream)); - // all2all exchange send/receive sizes auto size_recv = create_buffer(num_ranks, Memory::Z_COPY_MEM); CHECK_NCCL(ncclGroupStart()); diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 192a75333..53ec8f503 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -71,7 +71,7 @@ struct SortImplBody { { auto input = input_array.read_accessor(rect); assert(input.accessor.is_dense_row_major(rect)); - assert(!is_index_space || DIM > 1); // not implemented for now + assert(!is_index_space || DIM > 1); if (argsort) { // make copy of the input diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 593b7cc21..75472925c 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -53,11 +53,21 @@ struct SortImpl { * 1. Sort is always requested for the 'last' dimension within rect * 2. We have product_of_all_other_dimensions independent sort ranges * 3. if we have more than one participants: - * a) 1D-case: we need to perform parallel sort (e.g. via sampling) -- (only implemented for - * GPU) b) ND-case: rect needs to be the full domain in that last dimension + * a) 1D-case: we perform parallel sort (via sampling) -- (only implemented for GPU) + * b) ND-case: rect needs to be the full domain in that last dimension * */ +#ifdef DEBUG_CUNUMERIC + std::cout << typeid(KIND).name() << "(" << args.local_rank << "/" << args.num_ranks + << "): volume = " << volume << ", DIM=" << DIM << ", rect=" << rect + << ", dist. = " << args.is_index_space << ", stable. = " << args.stable + << ", argsort. 
= " << args.argsort << std::endl; + + assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.sort_dim_size)) && + "multi-dimensional array should not be distributed in (sort) dimension"); +#endif + // we shall not return on empty rectangle in case of distributed data // as the process might still participate in the parallel sort if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; From d945468e4753b4434a94bf78b162d9c7d7049838 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 12:33:37 -0800 Subject: [PATCH 36/49] argsort also allows non-stable sort --- src/cunumeric/sort/sort.cc | 16 ++++++++---- src/cunumeric/sort/sort.cu | 38 ++++++++++++++++++++-------- src/cunumeric/sort/sort_omp.cc | 16 ++++++++---- src/cunumeric/sort/sort_template.inl | 12 +-------- 4 files changed, 51 insertions(+), 31 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 137c4ea21..77eea8456 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -35,7 +35,8 @@ struct SortImplBody { void thrust_local_sort_inplace(VAL* inptr, int64_t* argptr, const size_t volume, - const size_t sort_dim_size) + const size_t sort_dim_size, + const bool stable_argsort) { if (argptr == nullptr) { // sort (in place) @@ -48,8 +49,13 @@ struct SortImplBody { int64_t* segmentValues = argptr + start_idx; VAL* segmentKeys = inptr + start_idx; std::iota(segmentValues, segmentValues + sort_dim_size, 0); // init - thrust::stable_sort_by_key( - thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + if (stable_argsort) { + thrust::stable_sort_by_key( + thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + } else { + thrust::sort_by_key( + thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + } } } } @@ -83,7 +89,7 @@ struct SortImplBody { // sort data in place thrust_local_sort_inplace( - dense_input_copy.ptr(0), output.ptr(rect.lo), volume, sort_dim_size); + dense_input_copy.ptr(0), output.ptr(rect.lo), volume, sort_dim_size, stable); } else { AccessorWO output = output_array.write_accessor(rect); @@ -93,7 +99,7 @@ struct SortImplBody { std::copy(src, src + volume, output.ptr(rect.lo)); // sort data in place - thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size); + thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size, stable); } } }; diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 8eb18762d..0b1f1a0c8 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -200,8 +200,12 @@ void cub_local_sort_inplace( } template -void thrust_local_sort_inplace( - VAL* inptr, int64_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) +void thrust_local_sort_inplace(VAL* inptr, + int64_t* argptr, + const size_t volume, + const size_t sort_dim_size, + const bool stable_argsort, + cudaStream_t stream) { if (argptr == nullptr) { if (volume == sort_dim_size) { @@ -224,7 +228,11 @@ void thrust_local_sort_inplace( } } else { if (volume == sort_dim_size) { - thrust::stable_sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr); + if (stable_argsort) { + thrust::stable_sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr); + } else { + thrust::sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr); + } } else { auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); // init combined keys @@ -236,11 +244,19 @@ void 
thrust_local_sort_inplace( thrust::divides()); auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), inptr)); - thrust::stable_sort_by_key(thrust::cuda::par.on(stream), - combined, - combined + volume, - argptr, - thrust::less>()); + if (stable_argsort) { + thrust::stable_sort_by_key(thrust::cuda::par.on(stream), + combined, + combined + volume, + argptr, + thrust::less>()); + } else { + thrust::sort_by_key(thrust::cuda::par.on(stream), + combined, + combined + volume, + argptr, + thrust::less>()); + } } } } @@ -260,6 +276,7 @@ void local_sort_inplace(legate_type_of* inptr, int64_t* argptr, const size_t volume, const size_t sort_dim_size, + const bool stable_argsort, // cub sort is always stable cudaStream_t stream) { using VAL = legate_type_of; @@ -271,10 +288,11 @@ void local_sort_inplace(legate_type_of* inptr, int64_t* argptr, const size_t volume, const size_t sort_dim_size, + const bool stable_argsort, cudaStream_t stream) { using VAL = legate_type_of; - thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); + thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stable_argsort, stream); } // auto align to multiples of 16 bytes @@ -630,7 +648,7 @@ struct SortImplBody { values_ptr, input.ptr(rect.lo), sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); // sort data (locally) - local_sort_inplace(values_ptr, indices_ptr, volume, sort_dim_size, stream); + local_sort_inplace(values_ptr, indices_ptr, volume, sort_dim_size, stable, stream); } // this is linked to the decision in sorting.py on when to use an 'unbounded' output array. diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 53ec8f503..c552fcb90 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -35,7 +35,8 @@ struct SortImplBody { void thrust_local_sort_inplace(VAL* inptr, int64_t* argptr, const size_t volume, - const size_t sort_dim_size) + const size_t sort_dim_size, + const bool stable_argsort) { if (argptr == nullptr) { // sort (in place) @@ -50,8 +51,13 @@ struct SortImplBody { int64_t* segmentValues = argptr + start_idx; VAL* segmentKeys = inptr + start_idx; std::iota(segmentValues, segmentValues + sort_dim_size, 0); // init - thrust::stable_sort_by_key( - thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + if (stable_argsort) { + thrust::stable_sort_by_key( + thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + } else { + thrust::sort_by_key( + thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + } } } } @@ -85,7 +91,7 @@ struct SortImplBody { // sort data in place thrust_local_sort_inplace( - dense_input_copy.ptr(0), output.ptr(rect.lo), volume, sort_dim_size); + dense_input_copy.ptr(0), output.ptr(rect.lo), volume, sort_dim_size, stable); } else { AccessorWO output = output_array.write_accessor(rect); @@ -95,7 +101,7 @@ struct SortImplBody { std::copy(src, src + volume, output.ptr(rect.lo)); // sort data in place - thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size); + thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size, stable); } } }; diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 75472925c..610989220 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -53,21 +53,11 @@ struct SortImpl { * 1. Sort is always requested for the 'last' dimension within rect * 2. 
We have product_of_all_other_dimensions independent sort ranges * 3. if we have more than one participants: - * a) 1D-case: we perform parallel sort (via sampling) -- (only implemented for GPU) + * a) 1D-case: we perform parallel sort (via sampling) * b) ND-case: rect needs to be the full domain in that last dimension * */ -#ifdef DEBUG_CUNUMERIC - std::cout << typeid(KIND).name() << "(" << args.local_rank << "/" << args.num_ranks - << "): volume = " << volume << ", DIM=" << DIM << ", rect=" << rect - << ", dist. = " << args.is_index_space << ", stable. = " << args.stable - << ", argsort. = " << args.argsort << std::endl; - - assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.sort_dim_size)) && - "multi-dimensional array should not be distributed in (sort) dimension"); -#endif - // we shall not return on empty rectangle in case of distributed data // as the process might still participate in the parallel sort if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; From 898a8d2132ba02d9599c103251b523dd15efdea7 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 14:00:15 -0800 Subject: [PATCH 37/49] adjusted more tests to force stable sort when comparing argsort results --- tests/sort.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/sort.py b/tests/sort.py index 4937de7ec..d662183a7 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -31,10 +31,13 @@ def test_sort_axis(a_np, a_num, axis): compare_assert(a_np, a_num) print("Sorting axis " + str(axis) + ":") sort_np = np.sort(a_np, axis, kind="stable") + sort_num = num.sort(a_num, axis, kind="stable") + compare_assert(sort_np, sort_num) + sort_np = np.sort(a_np, axis) sort_num = num.sort(a_num, axis) compare_assert(sort_np, sort_num) argsort_np = np.argsort(a_np, axis, kind="stable") - argsort_num = num.argsort(a_num, axis) + argsort_num = num.argsort(a_num, axis, kind="stable") compare_assert(argsort_np, argsort_num) @@ -118,6 +121,10 @@ def test_api(a=None): np.sort(a, axis=i, kind="stable"), num.sort(a_num, i, kind="stable"), ) + compare_assert( + np.sort(a, axis=i), + num.sort(a_num, i), + ) # flatten print("sort flattened") @@ -125,6 +132,10 @@ def test_api(a=None): np.sort(a, axis=None, kind="stable"), num.sort(a_num, axis=None, kind="stable"), ) + compare_assert( + np.sort(a, axis=None), + num.sort(a_num, axis=None), + ) # msort print("msort") @@ -149,6 +160,7 @@ def test_api(a=None): np.argsort(a, axis=i, kind="stable"), num.argsort(a_num, axis=i, kind="stable"), ) + num.argsort(a_num, axis=i) # cannot be compared # flatten print("argsort flattened") @@ -156,6 +168,7 @@ def test_api(a=None): np.argsort(a, axis=None, kind="stable"), num.argsort(a_num, axis=None, kind="stable"), ) + num.argsort(a_num, axis=None) # cannot be compared def generate_random(shape, datatype): From 09ac1c872a467b5746a986f8eb929a354401e15c Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 14:40:36 -0800 Subject: [PATCH 38/49] clarify offset iterator usage --- src/cunumeric/sort/sort.cu | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 0b1f1a0c8..33ae367c0 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -81,7 +81,7 @@ void cub_local_sort_inplace( size_t temp_storage_bytes = 0; if (argptr == nullptr) { if (volume == sort_dim_size) { - // sort (initial call to compute bufffer size) + // sort (initial call to compute buffer 
size) cub::DeviceRadixSort::SortKeys( nullptr, temp_storage_bytes, keys_in.ptr(0), inptr, volume, 0, sizeof(VAL) * 8, stream); auto temp_storage = @@ -95,10 +95,11 @@ void cub_local_sort_inplace( sizeof(VAL) * 8, stream); } else { - // segmented sort (initial call to compute bufffer size) - auto off_start_it = + // segmented sort (initial call to compute buffer size) + // generate start/end positions for all segments via iterators to avoid allocating buffers + auto off_start_pos_it = thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); - auto off_end_it = + auto off_end_pos_it = thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); cub::DeviceSegmentedRadixSort::SortKeys(nullptr, @@ -107,8 +108,8 @@ void cub_local_sort_inplace( inptr, volume, volume / sort_dim_size, - off_start_it, - off_end_it, + off_start_pos_it, + off_end_pos_it, 0, sizeof(VAL) * 8, stream); @@ -121,8 +122,8 @@ void cub_local_sort_inplace( inptr, volume, volume / sort_dim_size, - off_start_it, - off_end_it, + off_start_pos_it, + off_end_pos_it, 0, sizeof(VAL) * 8, stream); @@ -133,7 +134,7 @@ void cub_local_sort_inplace( idx_in.ptr(0), argptr, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream)); if (volume == sort_dim_size) { - // argsort (initial call to compute bufffer size) + // argsort (initial call to compute buffer size) cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, keys_in.ptr(0), @@ -159,10 +160,11 @@ void cub_local_sort_inplace( sizeof(VAL) * 8, stream); } else { - // segmented argsort (initial call to compute bufffer size) - auto off_start_it = + // segmented argsort (initial call to compute buffer size) + // generate start/end positions for all segments via iterators to avoid allocating buffers + auto off_start_pos_it = thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); - auto off_end_it = + auto off_end_pos_it = thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); cub::DeviceSegmentedRadixSort::SortPairs(nullptr, @@ -173,8 +175,8 @@ void cub_local_sort_inplace( argptr, volume, volume / sort_dim_size, - off_start_it, - off_end_it, + off_start_pos_it, + off_end_pos_it, 0, sizeof(VAL) * 8, stream); @@ -190,8 +192,8 @@ void cub_local_sort_inplace( argptr, volume, volume / sort_dim_size, - off_start_it, - off_end_it, + off_start_pos_it, + off_end_pos_it, 0, sizeof(VAL) * 8, stream); From 9cd31bb0161f1accc3fa711c80f6ef913a1621fe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Mar 2022 09:35:37 +0000 Subject: [PATCH 39/49] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/cunumeric/mapper.cc | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index a769bd0fc..fd1bfec2f 100644 --- a/src/cunumeric/mapper.cc +++ b/src/cunumeric/mapper.cc @@ -125,20 +125,20 @@ std::vector CuNumericMapper::store_mappings( mappings.back().policy.ordering.fortran_order(); mappings.back().policy.exact = true; return std::move(mappings); - case CUNUMERIC_SORT: { - std::vector mappings; - auto& inputs = task.inputs(); - mappings.push_back(StoreMapping::default_mapping(inputs[0], options.front())); - mappings.back().policy.ordering.c_order(); - mappings.back().policy.exact = true; - return std::move(mappings); + case CUNUMERIC_SORT: { + 
std::vector mappings; + auto& inputs = task.inputs(); + mappings.push_back(StoreMapping::default_mapping(inputs[0], options.front())); + mappings.back().policy.ordering.c_order(); + mappings.back().policy.exact = true; + return std::move(mappings); + } + default: { + return {}; + } } - default: { + assert(false); return {}; - } } - assert(false); - return {}; -} } // namespace cunumeric From da79f8600fbf7b42bb622838379657a504048d3d Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 10 Mar 2022 01:45:55 -0800 Subject: [PATCH 40/49] fixed merge conflict --- src/cunumeric/mapper.cc | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index fd1bfec2f..962e68ecc 100644 --- a/src/cunumeric/mapper.cc +++ b/src/cunumeric/mapper.cc @@ -125,20 +125,21 @@ std::vector CuNumericMapper::store_mappings( mappings.back().policy.ordering.fortran_order(); mappings.back().policy.exact = true; return std::move(mappings); - case CUNUMERIC_SORT: { - std::vector mappings; - auto& inputs = task.inputs(); - mappings.push_back(StoreMapping::default_mapping(inputs[0], options.front())); - mappings.back().policy.ordering.c_order(); - mappings.back().policy.exact = true; - return std::move(mappings); - } - default: { - return {}; - } } - assert(false); + case CUNUMERIC_SORT: { + std::vector mappings; + auto& inputs = task.inputs(); + mappings.push_back(StoreMapping::default_mapping(inputs[0], options.front())); + mappings.back().policy.ordering.c_order(); + mappings.back().policy.exact = true; + return std::move(mappings); + } + default: { return {}; + } } + assert(false); + return {}; +} } // namespace cunumeric From 04f811beca317aca2f729f0d0ed40d9ee8277da7 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 11 Mar 2022 05:01:21 -0800 Subject: [PATCH 41/49] ensure 16byte alignment for NCCL transfers --- src/cunumeric/sort/sort.cu | 93 +++++++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 17 deletions(-) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 33ae367c0..2701e188c 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -298,7 +298,10 @@ void local_sort_inplace(legate_type_of* inptr, } // auto align to multiples of 16 bytes -auto get_aligned_size = [](auto size) { return std::max(16, (size + 15) / 16 * 16); }; +auto get_16b_aligned = [](auto bytes) { return std::max(16, (bytes + 15) / 16 * 16); }; +auto get_16b_aligned_count = [](auto count, auto element_bytes) { + return (get_16b_aligned(count * element_bytes) + element_bytes - 1) / element_bytes; +}; template struct SortPiece { @@ -488,40 +491,96 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, // allocate merge targets, data transfer... std::vector> merge_buffers(num_ranks); + std::vector aligned_pos_vals_send(num_ranks); + std::vector aligned_pos_idcs_send(num_ranks); + size_t buf_size_send_vals_total = 0; + size_t buf_size_send_idcs_total = 0; for (size_t i = 0; i < num_ranks; ++i) { // align buffer to allow data transfer of 16byte blocks - auto recv_size_aligned = get_aligned_size(size_recv[i] * sizeof(VAL)); - auto buf_size = (recv_size_aligned + sizeof(VAL) - 1) / sizeof(VAL); - merge_buffers[i].values = create_buffer(buf_size, Memory::GPU_FB_MEM); - merge_buffers[i].indices = create_buffer(argsort ? 
buf_size : 0, Memory::GPU_FB_MEM); - merge_buffers[i].size = size_recv[i]; + auto buf_size_vals_recv = get_16b_aligned_count(size_recv[i], sizeof(VAL)); + merge_buffers[i].values = create_buffer(buf_size_vals_recv, Memory::GPU_FB_MEM); + merge_buffers[i].size = size_recv[i]; + + aligned_pos_vals_send[i] = buf_size_send_vals_total; + buf_size_send_vals_total += get_16b_aligned_count(size_send[i], sizeof(VAL)); + + if (argsort) { + auto buf_size_idcs_recv = get_16b_aligned_count(size_recv[i], sizeof(int64_t)); + merge_buffers[i].indices = create_buffer(buf_size_idcs_recv, Memory::GPU_FB_MEM); + aligned_pos_idcs_send[i] = buf_size_send_idcs_total; + buf_size_send_idcs_total += get_16b_aligned_count(size_send[i], sizeof(int64_t)); + } else { + merge_buffers[i].indices = create_buffer(0, Memory::GPU_FB_MEM); + } + } + + // copy values into aligned send buffer + auto val_send_buf = local_sorted.values; + if (buf_size_send_vals_total > volume) { + val_send_buf = create_buffer(buf_size_send_vals_total, Memory::GPU_FB_MEM); + size_t pos = 0; + for (size_t r = 0; r < num_ranks; ++r) { + CHECK_CUDA(cudaMemcpyAsync(val_send_buf.ptr(aligned_pos_vals_send[r]), + local_sorted.values.ptr(pos), + sizeof(VAL) * size_send[r], + cudaMemcpyDeviceToDevice, + stream)); + pos += size_send[r]; + } } - size_t send_pos = 0; + + // copy indices into aligned send buffer + auto idc_send_buf = local_sorted.indices; + if (argsort && buf_size_send_idcs_total > volume) { + idc_send_buf = create_buffer(buf_size_send_idcs_total, Memory::GPU_FB_MEM); + size_t pos = 0; + for (size_t r = 0; r < num_ranks; ++r) { + CHECK_CUDA(cudaMemcpyAsync(idc_send_buf.ptr(aligned_pos_idcs_send[r]), + local_sorted.indices.ptr(pos), + sizeof(int64_t) * size_send[r], + cudaMemcpyDeviceToDevice, + stream)); + pos += size_send[r]; + } + } + CHECK_NCCL(ncclGroupStart()); for (size_t r = 0; r < num_ranks; r++) { - CHECK_NCCL(ncclSend(local_sorted.values.ptr(send_pos), - get_aligned_size(size_send[r] * sizeof(VAL)), + CHECK_NCCL(ncclSend(val_send_buf.ptr(aligned_pos_vals_send[r]), + get_16b_aligned(size_send[r] * sizeof(VAL)), ncclInt8, r, *comm, stream)); CHECK_NCCL(ncclRecv(merge_buffers[r].values.ptr(0), - get_aligned_size(size_recv[r] * sizeof(VAL)), + get_16b_aligned(size_recv[r] * sizeof(VAL)), ncclInt8, r, *comm, stream)); - if (argsort) { - CHECK_NCCL( - ncclSend(local_sorted.indices.ptr(send_pos), size_send[r], ncclInt64, r, *comm, stream)); - CHECK_NCCL( - ncclRecv(merge_buffers[r].indices.ptr(0), size_recv[r], ncclInt64, r, *comm, stream)); - } - send_pos += size_send[r]; } CHECK_NCCL(ncclGroupEnd()); + if (argsort) { + CHECK_NCCL(ncclGroupStart()); + for (size_t r = 0; r < num_ranks; r++) { + CHECK_NCCL(ncclSend(idc_send_buf.ptr(aligned_pos_idcs_send[r]), + get_16b_aligned_count(size_send[r], sizeof(int64_t)), + ncclInt64, + r, + *comm, + stream)); + CHECK_NCCL(ncclRecv(merge_buffers[r].indices.ptr(0), + get_16b_aligned_count(size_recv[r], sizeof(int64_t)), + ncclInt64, + r, + *comm, + stream)); + } + CHECK_NCCL(ncclGroupEnd()); + } + // now merge sort all into the result buffer // maybe k-way merge is more efficient here... 
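As the comment above says, the merge phase combines the num_ranks locally sorted, range-partitioned pieces pairwise with stride doubling, i.e. in log2(num_ranks) rounds, whereas a k-way merge would touch each element only once. A host-side sketch of the pairwise scheme (illustrative only, not the CUDA implementation):

    #include <algorithm>
    #include <cstddef>
    #include <iterator>
    #include <utility>
    #include <vector>

    // Merge num_ranks sorted vectors pairwise in ceil(log2(num_ranks)) rounds.
    // Assumes pieces is non-empty.
    std::vector<int> merge_tree(std::vector<std::vector<int>> pieces)
    {
      for (std::size_t stride = 1; stride < pieces.size(); stride *= 2) {
        for (std::size_t i = 0; i + stride < pieces.size(); i += 2 * stride) {
          std::vector<int> merged;
          std::merge(pieces[i].begin(), pieces[i].end(),
                     pieces[i + stride].begin(), pieces[i + stride].end(),
                     std::back_inserter(merged));
          pieces[i] = std::move(merged);
        }
      }
      return std::move(pieces.front());
    }
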
for (size_t stride = 1; stride < num_ranks; stride *= 2) { From 568523fcb2a9e3116d458287ba7ac783f3b5b752 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 16 Mar 2022 02:33:15 -0700 Subject: [PATCH 42/49] some minor adjustments --- cunumeric/deferred.py | 4 ++-- cunumeric/{sorting.py => sort.py} | 4 ++-- examples/sort.py | 24 ++++++++---------------- src/cunumeric/sort/sort.cu | 8 ++++++-- src/cunumeric/sort/sort_template.inl | 3 +++ 5 files changed, 21 insertions(+), 22 deletions(-) rename cunumeric/{sorting.py => sort.py} (96%) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index fa9d18ef1..57327aeaf 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -32,7 +32,7 @@ UnaryRedCode, ) from .linalg.cholesky import cholesky -from .sorting import sorting +from .sort import sort from .thunk import NumPyThunk from .utils import get_arg_value_dtype @@ -1559,4 +1559,4 @@ def sort(self, rhs, argsort=False, axis=-1, kind="quicksort", order=None): if axis is not None and (axis >= rhs.ndim or axis < -rhs.ndim): raise ValueError("invalid axis") - sorting(self, rhs, argsort, axis, stable) + sort(self, rhs, argsort, axis, stable) diff --git a/cunumeric/sorting.py b/cunumeric/sort.py similarity index 96% rename from cunumeric/sorting.py rename to cunumeric/sort.py index f8c56fa0b..b2b8bb43d 100644 --- a/cunumeric/sorting.py +++ b/cunumeric/sort.py @@ -68,9 +68,9 @@ def sort_task(output, input, argsort, stable): if output.ndim > 1: task.add_broadcast(input.base, input.ndim - 1) - elif output.runtime.num_gpus > 0: + elif output.runtime.num_gpus > 1: task.add_nccl_communicator() - elif output.runtime.num_procs > 1: + elif output.runtime.num_gpus == 0 and output.runtime.num_procs > 1: # Distributed 1D sort on CPU not supported yet task.add_broadcast(input.base) diff --git a/examples/sort.py b/examples/sort.py index 6713da232..179cc223d 100644 --- a/examples/sort.py +++ b/examples/sort.py @@ -37,14 +37,15 @@ def check_sorted(a, a_sorted, axis=-1): assert False -def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): +def run_sort(shape, axis, datatype, lower, upper, perform_check, timing): num.random.seed(42) newtype = np.dtype(datatype).type - if shape is not None: - shape = tuple(shape) - else: - shape = (N,) + + N = 1 + for e in shape: + N *= e + shape = tuple(shape) if np.issubdtype(newtype, np.integer): if lower is None: @@ -87,14 +88,6 @@ def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): action="store_true", help="check the result of the solve", ) - parser.add_argument( - "-n", - "--num", - type=int, - default=1000000, - dest="N", - help="number of elements in one dimension", - ) parser.add_argument( "-t", "--time", @@ -107,9 +100,9 @@ def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): "--shape", type=int, nargs="+", - default=None, + default=[1000000], dest="shape", - help="array reshape (default 'None')", + help="array reshape (default '[1000000]')", ) parser.add_argument( "-d", @@ -159,7 +152,6 @@ def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): args.benchmark, "Sort", ( - args.N, args.shape, args.axis, args.datatype, diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 2701e188c..3dc268bdd 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -282,7 +282,12 @@ void local_sort_inplace(legate_type_of* inptr, cudaStream_t stream) { using VAL = legate_type_of; - cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, 
stream); + // fallback to thrust approach as segmented radix sort is not suited for small segments + if (volume == sort_dim_size || sort_dim_size > 300) { + cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); + } else { + thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stable_argsort, stream); + } } template ::value>* = nullptr> @@ -450,7 +455,6 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, auto split_positions = create_buffer(num_splitters, Memory::Z_COPY_MEM); { const size_t num_blocks = (num_splitters + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - VAL init_value = std::numeric_limits::max(); extract_split_positions<<>>( local_sorted.values.ptr(0), volume, diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 610989220..5c63813eb 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -58,6 +58,9 @@ struct SortImpl { * */ + assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.sort_dim_size)) && + "multi-dimensional array should not be distributed in (sort) dimension"); + // we shall not return on empty rectangle in case of distributed data // as the process might still participate in the parallel sort if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; From e1b6c3182b8c5ef2744aedffdce993d9156deb13 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 16 Mar 2022 14:04:38 -0700 Subject: [PATCH 43/49] fixed renaming --- cunumeric/sort.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cunumeric/sort.py b/cunumeric/sort.py index b2b8bb43d..f1def3c72 100644 --- a/cunumeric/sort.py +++ b/cunumeric/sort.py @@ -26,7 +26,7 @@ def sort_flattened(output, input, argsort, stable): sort_result = output.runtime.create_empty_thunk( flattened.shape, dtype=output.dtype, inputs=(flattened,) ) - sorting(sort_result, flattened, argsort, stable=stable) + sort(sort_result, flattened, argsort, stable=stable) output.base = sort_result.base output.numpy_array = None @@ -46,7 +46,7 @@ def sort_swapped(output, input, argsort, sort_axis, stable): sort_result = output.runtime.create_empty_thunk( swapped_copy.shape, dtype=output.dtype, inputs=(swapped_copy,) ) - sorting(sort_result, swapped_copy, argsort, stable=stable) + sort(sort_result, swapped_copy, argsort, stable=stable) output.base = sort_result.swapaxes(input.ndim - 1, sort_axis).base output.numpy_array = None @@ -84,7 +84,7 @@ def sort_task(output, input, argsort, stable): output.numpy_array = None -def sorting(output, input, argsort, axis=-1, stable=False): +def sort(output, input, argsort, axis=-1, stable=False): if axis is None and input.ndim > 1: sort_flattened(output, input, argsort, stable) else: From 2edd7ba4bd35a48831f50b294a8c6fd6e685f995 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 18 Mar 2022 03:57:22 -0700 Subject: [PATCH 44/49] manually free temporary memory to reduce peak usage --- src/cunumeric/sort/sort.cu | 89 +++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 3dc268bdd..da28cd889 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -34,34 +34,6 @@ namespace cunumeric { using namespace Legion; -template -__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) - copy_into_buffer(VAL* out, - const AccessorRO accessor, - const Point lo, - const Pitches pitches, - const size_t volume) -{ - size_t 
offset = blockIdx.x * blockDim.x + threadIdx.x; - if (offset >= volume) return; - auto point = pitches.unflatten(offset, lo); - out[offset] = accessor[lo + point]; -} - -template -__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) - copy_into_output(AccessorWO accessor, - const VAL* data, - const Point lo, - const Pitches pitches, - const size_t volume) -{ - size_t offset = blockIdx.x * blockDim.x + threadIdx.x; - if (offset >= volume) return; - auto point = pitches.unflatten(offset, lo); - accessor[lo + point] = data[offset]; -} - struct multiply : public thrust::unary_function { const int constant; @@ -94,6 +66,7 @@ void cub_local_sort_inplace( 0, sizeof(VAL) * 8, stream); + temp_storage.destroy(); } else { // segmented sort (initial call to compute buffer size) // generate start/end positions for all segments via iterators to avoid allocating buffers @@ -127,6 +100,7 @@ void cub_local_sort_inplace( 0, sizeof(VAL) * 8, stream); + temp_storage.destroy(); } } else { auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); @@ -159,6 +133,7 @@ void cub_local_sort_inplace( 0, sizeof(VAL) * 8, stream); + temp_storage.destroy(); } else { // segmented argsort (initial call to compute buffer size) // generate start/end positions for all segments via iterators to avoid allocating buffers @@ -197,8 +172,11 @@ void cub_local_sort_inplace( 0, sizeof(VAL) * 8, stream); + temp_storage.destroy(); } + idx_in.destroy(); } + keys_in.destroy(); } template @@ -227,6 +205,8 @@ void thrust_local_sort_inplace(VAL* inptr, combined, combined + volume, thrust::less>()); + + sort_id.destroy(); } } else { if (volume == sort_dim_size) { @@ -259,6 +239,8 @@ void thrust_local_sort_inplace(VAL* inptr, argptr, thrust::less>()); } + + sort_id.destroy(); } } } @@ -481,6 +463,10 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, size_send[num_ranks - 1] = volume - last_position; } + // cleanup intermediate data structures + samples.destroy(); + split_positions.destroy(); + // all2all exchange send/receive sizes auto size_recv = create_buffer(num_ranks, Memory::Z_COPY_MEM); CHECK_NCCL(ncclGroupStart()); @@ -493,29 +479,18 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, // need to sync as we share values in between host/device CHECK_CUDA(cudaStreamSynchronize(stream)); - // allocate merge targets, data transfer... 
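The 16-byte alignment this commit's bookkeeping revolves around comes from the get_16b_aligned / get_16b_aligned_count helpers introduced earlier in the series: every NCCL send/recv is padded to whole 16-byte blocks. Restated as plain functions (equivalent in intent, illustrative form):

    #include <algorithm>
    #include <cstddef>

    // Round a byte count up to a multiple of 16, with a one-block minimum.
    std::size_t get_16b_aligned(std::size_t bytes)
    {
      return std::max<std::size_t>(16, (bytes + 15) / 16 * 16);
    }

    // Smallest element count whose allocation covers the aligned byte size.
    std::size_t get_16b_aligned_count(std::size_t count, std::size_t element_bytes)
    {
      return (get_16b_aligned(count * element_bytes) + element_bytes - 1) / element_bytes;
    }
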
- std::vector> merge_buffers(num_ranks); + // handle alignment std::vector aligned_pos_vals_send(num_ranks); std::vector aligned_pos_idcs_send(num_ranks); - size_t buf_size_send_vals_total = 0; size_t buf_size_send_idcs_total = 0; for (size_t i = 0; i < num_ranks; ++i) { // align buffer to allow data transfer of 16byte blocks - auto buf_size_vals_recv = get_16b_aligned_count(size_recv[i], sizeof(VAL)); - merge_buffers[i].values = create_buffer(buf_size_vals_recv, Memory::GPU_FB_MEM); - merge_buffers[i].size = size_recv[i]; - aligned_pos_vals_send[i] = buf_size_send_vals_total; buf_size_send_vals_total += get_16b_aligned_count(size_send[i], sizeof(VAL)); - if (argsort) { - auto buf_size_idcs_recv = get_16b_aligned_count(size_recv[i], sizeof(int64_t)); - merge_buffers[i].indices = create_buffer(buf_size_idcs_recv, Memory::GPU_FB_MEM); aligned_pos_idcs_send[i] = buf_size_send_idcs_total; buf_size_send_idcs_total += get_16b_aligned_count(size_send[i], sizeof(int64_t)); - } else { - merge_buffers[i].indices = create_buffer(0, Memory::GPU_FB_MEM); } } @@ -532,6 +507,7 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, stream)); pos += size_send[r]; } + local_sorted.values.destroy(); } // copy indices into aligned send buffer @@ -547,6 +523,21 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, stream)); pos += size_send[r]; } + local_sorted.indices.destroy(); + } + + // allocate target buffers + std::vector> merge_buffers(num_ranks); + for (size_t i = 0; i < num_ranks; ++i) { + auto buf_size_vals_recv = get_16b_aligned_count(size_recv[i], sizeof(VAL)); + merge_buffers[i].values = create_buffer(buf_size_vals_recv, Memory::GPU_FB_MEM); + merge_buffers[i].size = size_recv[i]; + if (argsort) { + auto buf_size_idcs_recv = get_16b_aligned_count(size_recv[i], sizeof(int64_t)); + merge_buffers[i].indices = create_buffer(buf_size_idcs_recv, Memory::GPU_FB_MEM); + } else { + merge_buffers[i].indices = create_buffer(0, Memory::GPU_FB_MEM); + } } CHECK_NCCL(ncclGroupStart()); @@ -585,6 +576,12 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, CHECK_NCCL(ncclGroupEnd()); } + // cleanup remaining buffers + size_send.destroy(); + size_recv.destroy(); + val_send_buf.destroy(); + idc_send_buf.destroy(); + // now merge sort all into the result buffer // maybe k-way merge is more efficient here... 
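The theme of this commit is to destroy() each temporary as soon as its last use has been issued on the stream, so peak framebuffer usage tracks the live working set rather than the sum of all temporaries. In generic C++ the same effect falls out of scoping; a toy sketch (the Legate buffers above need the explicit destroy() because they would otherwise outlive their creating scope):

    #include <cstddef>
    #include <numeric>
    #include <vector>

    // Assumes n >= 1.
    double staged_sum(std::size_t n)
    {
      double partial = 0.0;
      {
        std::vector<double> scratch(n, 1.0);  // temporary working buffer
        partial = std::accumulate(scratch.begin(), scratch.end(), 0.0);
      }                                       // scratch freed here ...
      std::vector<double> next(n, partial);   // ... so peak usage is ~n, not ~2n
      return next.back();
    }
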
for (size_t stride = 1; stride < num_ranks; stride *= 2) { @@ -664,14 +661,14 @@ struct SortImplBody { VAL* values_ptr = nullptr; if (argsort) { // make a buffer for input - auto input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - values_ptr = input_copy.ptr(0); + auto input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + local_sorted.values = input_copy; + values_ptr = input_copy.ptr(0); // initialize indices if (output_array.dim() == -1) { auto indices_buffer = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); indices_ptr = indices_buffer.ptr(0); - local_sorted.values = input_copy; local_sorted.indices = indices_buffer; local_sorted.size = volume; } else { @@ -699,8 +696,7 @@ struct SortImplBody { values_ptr = input_copy.ptr(0); local_sorted.values = input_copy; local_sorted.indices = create_buffer(0, Legion::Memory::Kind::GPU_FB_MEM); - ; - local_sorted.size = volume; + local_sorted.size = volume; } else { AccessorWO output = output_array.write_accessor(rect); assert(output.accessor.is_dense_row_major(rect)); @@ -730,6 +726,9 @@ struct SortImplBody { output_array.return_data(local_sorted_repartitioned.values, local_sorted_repartitioned.size); } + } else if (argsort) { + // cleanup + local_sorted.values.destroy(); } } }; From ee52211deb3ab19b1948b9b9b721d2efa59c3085 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 18 Mar 2022 05:13:56 -0700 Subject: [PATCH 45/49] refactor sort interface to prevent 1 unneeded copy --- src/cunumeric/sort/sort.cu | 181 ++++++++++++++++++++++++------------- 1 file changed, 116 insertions(+), 65 deletions(-) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index da28cd889..e4e5b2e22 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -43,25 +43,35 @@ struct multiply : public thrust::unary_function { }; template -void cub_local_sort_inplace( - VAL* inptr, int64_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) +void cub_local_sort(const VAL* values_in, + VAL* values_out, + const int64_t* indices_in, + int64_t* indices_out, + const size_t volume, + const size_t sort_dim_size, + cudaStream_t stream) { - // make a copy of input --> we want inptr to return sorted values - auto keys_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - CHECK_CUDA( - cudaMemcpyAsync(keys_in.ptr(0), inptr, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); + Buffer keys_in; + const VAL* values_in_cub = values_in; + if (values_in == values_out) { + keys_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + values_in_cub = keys_in.ptr(0); + CHECK_CUDA(cudaMemcpyAsync( + keys_in.ptr(0), values_out, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); + } + size_t temp_storage_bytes = 0; - if (argptr == nullptr) { + if (indices_out == nullptr) { if (volume == sort_dim_size) { // sort (initial call to compute buffer size) cub::DeviceRadixSort::SortKeys( - nullptr, temp_storage_bytes, keys_in.ptr(0), inptr, volume, 0, sizeof(VAL) * 8, stream); + nullptr, temp_storage_bytes, values_in_cub, values_out, volume, 0, sizeof(VAL) * 8, stream); auto temp_storage = create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); cub::DeviceRadixSort::SortKeys(temp_storage.ptr(0), temp_storage_bytes, - keys_in.ptr(0), - inptr, + values_in_cub, + values_out, volume, 0, sizeof(VAL) * 8, @@ -77,8 +87,8 @@ void cub_local_sort_inplace( cub::DeviceSegmentedRadixSort::SortKeys(nullptr, temp_storage_bytes, - keys_in.ptr(0), - inptr, + 
values_in_cub, + values_out, volume, volume / sort_dim_size, off_start_pos_it, @@ -91,8 +101,8 @@ void cub_local_sort_inplace( cub::DeviceSegmentedRadixSort::SortKeys(temp_storage.ptr(0), temp_storage_bytes, - keys_in.ptr(0), - inptr, + values_in_cub, + values_out, volume, volume / sort_dim_size, off_start_pos_it, @@ -103,18 +113,23 @@ void cub_local_sort_inplace( temp_storage.destroy(); } } else { - auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - CHECK_CUDA(cudaMemcpyAsync( - idx_in.ptr(0), argptr, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream)); + Buffer idx_in; + const int64_t* indices_in_cub = indices_in; + if (indices_in == indices_out) { + auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + indices_in_cub = idx_in.ptr(0); + CHECK_CUDA(cudaMemcpyAsync( + idx_in.ptr(0), indices_out, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream)); + } if (volume == sort_dim_size) { // argsort (initial call to compute buffer size) cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, - keys_in.ptr(0), - inptr, - idx_in.ptr(0), - argptr, + values_in_cub, + values_out, + indices_in_cub, + indices_out, volume, 0, sizeof(VAL) * 8, @@ -125,10 +140,10 @@ void cub_local_sort_inplace( cub::DeviceRadixSort::SortPairs(temp_storage.ptr(0), temp_storage_bytes, - keys_in.ptr(0), - inptr, - idx_in.ptr(0), - argptr, + values_in_cub, + values_out, + indices_in_cub, + indices_out, volume, 0, sizeof(VAL) * 8, @@ -144,10 +159,10 @@ void cub_local_sort_inplace( cub::DeviceSegmentedRadixSort::SortPairs(nullptr, temp_storage_bytes, - keys_in.ptr(0), - inptr, - idx_in.ptr(0), - argptr, + values_in_cub, + values_out, + indices_in_cub, + indices_out, volume, volume / sort_dim_size, off_start_pos_it, @@ -161,10 +176,10 @@ void cub_local_sort_inplace( cub::DeviceSegmentedRadixSort::SortPairs(temp_storage.ptr(0), temp_storage_bytes, - keys_in.ptr(0), - inptr, - idx_in.ptr(0), - argptr, + values_in_cub, + values_out, + indices_in_cub, + indices_out, volume, volume / sort_dim_size, off_start_pos_it, @@ -174,22 +189,36 @@ void cub_local_sort_inplace( stream); temp_storage.destroy(); } - idx_in.destroy(); + if (indices_in == indices_out) idx_in.destroy(); } - keys_in.destroy(); + + if (values_in == values_out) keys_in.destroy(); } template -void thrust_local_sort_inplace(VAL* inptr, - int64_t* argptr, - const size_t volume, - const size_t sort_dim_size, - const bool stable_argsort, - cudaStream_t stream) +void thrust_local_sort(const VAL* values_in, + VAL* values_out, + const int64_t* indices_in, + int64_t* indices_out, + const size_t volume, + const size_t sort_dim_size, + const bool stable_argsort, + cudaStream_t stream) { - if (argptr == nullptr) { + if (values_in != values_out) { + // not in-place --> need a copy + CHECK_CUDA(cudaMemcpyAsync( + values_out, values_in, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); + } + if (indices_in != indices_out) { + // not in-place --> need a copy + CHECK_CUDA(cudaMemcpyAsync( + indices_out, values_in, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream)); + } + + if (indices_out == nullptr) { if (volume == sort_dim_size) { - thrust::sort(thrust::cuda::par.on(stream), inptr, inptr + volume); + thrust::sort(thrust::cuda::par.on(stream), values_out, values_out + volume); } else { auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); // init combined keys @@ -199,7 +228,7 @@ void thrust_local_sort_inplace(VAL* inptr, thrust::make_constant_iterator(sort_dim_size), sort_id.ptr(0), 
@@ -199,7 +228,7 @@ void thrust_local_sort_inplace(VAL* inptr,
                         thrust::make_constant_iterator(sort_dim_size),
                         sort_id.ptr(0),
                         thrust::divides<size_t>());
-      auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), inptr));
+      auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), values_out));
       thrust::sort(thrust::cuda::par.on(stream),
                    combined,
@@ -211,9 +240,11 @@ void thrust_local_sort_inplace(VAL* inptr,
   } else {
     if (volume == sort_dim_size) {
       if (stable_argsort) {
-        thrust::stable_sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr);
+        thrust::stable_sort_by_key(
+          thrust::cuda::par.on(stream), values_out, values_out + volume, indices_out);
       } else {
-        thrust::sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr);
+        thrust::sort_by_key(
+          thrust::cuda::par.on(stream), values_out, values_out + volume, indices_out);
       }
     } else {
       auto sort_id = create_buffer<size_t>(volume, Legion::Memory::Kind::GPU_FB_MEM);
@@ -224,19 +255,19 @@ void thrust_local_sort_inplace(VAL* inptr,
                         thrust::make_constant_iterator(sort_dim_size),
                         sort_id.ptr(0),
                         thrust::divides<size_t>());
-      auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), inptr));
+      auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), values_out));
 
       if (stable_argsort) {
         thrust::stable_sort_by_key(thrust::cuda::par.on(stream),
                                    combined,
                                    combined + volume,
-                                   argptr,
+                                   indices_out,
                                    thrust::less<thrust::tuple<size_t, VAL>>());
       } else {
         thrust::sort_by_key(thrust::cuda::par.on(stream),
                             combined,
                             combined + volume,
-                            argptr,
+                            indices_out,
                             thrust::less<thrust::tuple<size_t, VAL>>());
       }
@@ -256,32 +287,45 @@ struct support_cub<...> : std::false_type {
 };
 
 template <LegateTypeCode CODE, std::enable_if_t<support_cub<CODE>::value>* = nullptr>
-void local_sort_inplace(legate_type_of<CODE>* inptr,
-                        int64_t* argptr,
-                        const size_t volume,
-                        const size_t sort_dim_size,
-                        const bool stable_argsort,  // cub sort is always stable
-                        cudaStream_t stream)
+void local_sort(const legate_type_of<CODE>* values_in,
+                legate_type_of<CODE>* values_out,
+                const int64_t* indices_in,
+                int64_t* indices_out,
+                const size_t volume,
+                const size_t sort_dim_size,
+                const bool stable_argsort,  // cub sort is always stable
+                cudaStream_t stream)
 {
   using VAL = legate_type_of<CODE>;
   // fallback to thrust approach as segmented radix sort is not suited for small segments
   if (volume == sort_dim_size || sort_dim_size > 300) {
-    cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream);
+    cub_local_sort(values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream);
   } else {
-    thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stable_argsort, stream);
+    thrust_local_sort(
+      values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable_argsort, stream);
   }
 }
 
 template <LegateTypeCode CODE, std::enable_if_t<!support_cub<CODE>::value>* = nullptr>
-void local_sort_inplace(legate_type_of<CODE>* inptr,
-                        int64_t* argptr,
-                        const size_t volume,
-                        const size_t sort_dim_size,
-                        const bool stable_argsort,
-                        cudaStream_t stream)
+void local_sort(const legate_type_of<CODE>* values_in,
+                legate_type_of<CODE>* values_out,
+                const int64_t* indices_in,
+                int64_t* indices_out,
+                const size_t volume,
+                const size_t sort_dim_size,
+                const bool stable_argsort,
+                cudaStream_t stream)
 {
   using VAL = legate_type_of<CODE>;
-  thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stable_argsort, stream);
+  thrust_local_sort(
+    values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable_argsort, stream);
 }
 
 // auto align to multiples of 16 bytes
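The `support_cub` trait above routes radix-sortable element types to CUB whenever segments are large enough (the 300-element cutoff), and everything else to the comparison-based thrust path; the last hunk of this patch, below, rewires the task body to the new entry point. A minimal sketch of that enable_if dispatch pattern, under assumed names:

    #include <algorithm>
    #include <cstddef>
    #include <type_traits>

    // 'radix_supported' plays the role of support_cub, selecting a backend
    // at compile time. Names and bodies are illustrative stand-ins.
    template <typename T>
    struct radix_supported : std::is_arithmetic<T> {};

    template <typename T, std::enable_if_t<radix_supported<T>::value>* = nullptr>
    void sort_backend(T* data, std::size_t n)
    {
      std::sort(data, data + n);  // stand-in for the radix path
    }

    template <typename T, std::enable_if_t<!radix_supported<T>::value>* = nullptr>
    void sort_backend(T* data, std::size_t n)
    {
      std::stable_sort(data, data + n);  // stand-in for the comparison path
    }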
@@ -709,7 +753,14 @@ struct SortImplBody<VariantKind::GPU, CODE, DIM> {
         values_ptr, input.ptr(rect.lo), sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream));
       // sort data (locally)
-      local_sort_inplace(values_ptr, indices_ptr, volume,
-                         sort_dim_size, stable, stream);
+      local_sort(input.ptr(rect.lo),
+                 values_ptr,
+                 indices_ptr,
+                 indices_ptr,
+                 volume,
+                 sort_dim_size,
+                 stable,
+                 stream);
     }
 
     // this is linked to the decision in sorting.py on when to use an 'unbounded' output array.

From 927b54f2d2793de65b38340551fded3ff68eb417 Mon Sep 17 00:00:00 2001
From: Malte Foerster
Date: Fri, 18 Mar 2022 17:20:35 +0000
Subject: [PATCH 46/49] fixed init issue

---
 src/cunumeric/sort/sort.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu
index e4e5b2e22..17cbadd67 100644
--- a/src/cunumeric/sort/sort.cu
+++ b/src/cunumeric/sort/sort.cu
@@ -116,7 +116,7 @@ void cub_local_sort(const VAL* values_in,
     Buffer<int64_t> idx_in;
     const int64_t* indices_in_cub = indices_in;
     if (indices_in == indices_out) {
-      auto idx_in    = create_buffer<int64_t>(volume, Legion::Memory::Kind::GPU_FB_MEM);
+      idx_in         = create_buffer<int64_t>(volume, Legion::Memory::Kind::GPU_FB_MEM);
       indices_in_cub = idx_in.ptr(0);
       CHECK_CUDA(cudaMemcpyAsync(
         idx_in.ptr(0), indices_out, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream));

From 10e7ebb15b26880b343339c7da995f4737a9b650 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 18 Mar 2022 17:23:55 +0000
Subject: [PATCH 47/49] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/cunumeric/sort/sort.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu
index 17cbadd67..682b703f0 100644
--- a/src/cunumeric/sort/sort.cu
+++ b/src/cunumeric/sort/sort.cu
@@ -116,7 +116,7 @@ void cub_local_sort(const VAL* values_in,
     Buffer<int64_t> idx_in;
     const int64_t* indices_in_cub = indices_in;
     if (indices_in == indices_out) {
-      idx_in         = create_buffer<int64_t>(volume, Legion::Memory::Kind::GPU_FB_MEM);
+      idx_in = create_buffer<int64_t>(volume, Legion::Memory::Kind::GPU_FB_MEM);
       indices_in_cub = idx_in.ptr(0);
       CHECK_CUDA(cudaMemcpyAsync(
         idx_in.ptr(0), indices_out, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream));
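The next patch replaces the `#pragma omp parallel for` loops of sequential `thrust::host` sorts with Thrust's OpenMP execution policy, which parallelizes each sort call internally — a better fit when there are few, large segments. A minimal usage sketch, not code from the patch (assumes a Thrust build with the OpenMP backend enabled and compilation with -fopenmp):

    #include <thrust/sort.h>
    #include <thrust/system/omp/execution_policy.h>
    #include <vector>

    int main()
    {
      std::vector<int> data{5, 3, 1, 4, 2};
      // thrust::omp::par splits this single sort across OpenMP threads,
      // instead of running one sequential sort per thread as before
      thrust::sort(thrust::omp::par, data.data(), data.data() + data.size());
      return 0;
    }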
From e52b0177980ba0f7f190b934683006cd48f04688 Mon Sep 17 00:00:00 2001
From: Malte Foerster
Date: Fri, 18 Mar 2022 17:38:43 +0000
Subject: [PATCH 48/49] change to thrust openmp policy

---
 src/cunumeric/sort/sort_omp.cc | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc
index c552fcb90..c26d606a5 100644
--- a/src/cunumeric/sort/sort_omp.cc
+++ b/src/cunumeric/sort/sort_omp.cc
@@ -19,6 +19,7 @@
 #include <...>
 #include <...>
+#include <thrust/system/omp/execution_policy.h>
 #include <...>
 #include <...>
@@ -40,23 +41,21 @@ struct SortImplBody<VariantKind::OMP, CODE, DIM> {
   {
     if (argptr == nullptr) {
       // sort (in place)
-#pragma omp parallel for
       for (size_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) {
-        thrust::sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size);
+        thrust::sort(thrust::omp::par, inptr + start_idx, inptr + start_idx + sort_dim_size);
       }
     } else {
       // argsort
-#pragma omp parallel for
       for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) {
         int64_t* segmentValues = argptr + start_idx;
         VAL* segmentKeys       = inptr + start_idx;
         std::iota(segmentValues, segmentValues + sort_dim_size, 0);  // init
         if (stable_argsort) {
           thrust::stable_sort_by_key(
-            thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues);
+            thrust::omp::par, segmentKeys, segmentKeys + sort_dim_size, segmentValues);
         } else {
           thrust::sort_by_key(
-            thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues);
+            thrust::omp::par, segmentKeys, segmentKeys + sort_dim_size, segmentValues);
         }
       }
     }

From 99798e303a1f9a33c277cad23e6b293ac5ff11b4 Mon Sep 17 00:00:00 2001
From: Malte Foerster
Date: Mon, 21 Mar 2022 15:13:48 -0700
Subject: [PATCH 49/49] removed another copy on python side in case we can sort in place

---
 cunumeric/sort.py              | 18 +++++++++++-------
 src/cunumeric/sort/sort.cc     |  7 ++++---
 src/cunumeric/sort/sort.cu     |  5 -----
 src/cunumeric/sort/sort_omp.cc |  7 ++++---
 4 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/cunumeric/sort.py b/cunumeric/sort.py
index f1def3c72..fca32e80d 100644
--- a/cunumeric/sort.py
+++ b/cunumeric/sort.py
@@ -43,13 +43,17 @@ def sort_swapped(output, input, argsort, sort_axis, stable):
     swapped_copy.copy(swapped, deep=True)
 
     # run sort on last axis
-    sort_result = output.runtime.create_empty_thunk(
-        swapped_copy.shape, dtype=output.dtype, inputs=(swapped_copy,)
-    )
-    sort(sort_result, swapped_copy, argsort, stable=stable)
-
-    output.base = sort_result.swapaxes(input.ndim - 1, sort_axis).base
-    output.numpy_array = None
+    if argsort is True:
+        sort_result = output.runtime.create_empty_thunk(
+            swapped_copy.shape, dtype=output.dtype, inputs=(swapped_copy,)
+        )
+        sort(sort_result, swapped_copy, argsort, stable=stable)
+        output.base = sort_result.swapaxes(input.ndim - 1, sort_axis).base
+        output.numpy_array = None
+    else:
+        sort(swapped_copy, swapped_copy, argsort, stable=stable)
+        output.base = swapped_copy.swapaxes(input.ndim - 1, sort_axis).base
+        output.numpy_array = None
 
 
 def sort_task(output, input, argsort, stable):
diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc
index 77eea8456..dda79d396 100644
--- a/src/cunumeric/sort/sort.cc
+++ b/src/cunumeric/sort/sort.cc
@@ -95,11 +95,12 @@ struct SortImplBody<VariantKind::CPU, CODE, DIM> {
       AccessorWO<VAL, DIM> output = output_array.write_accessor<VAL, DIM>(rect);
 
       // init output values
-      auto* src = input.ptr(rect.lo);
-      std::copy(src, src + volume, output.ptr(rect.lo));
+      auto* src    = input.ptr(rect.lo);
+      auto* target = output.ptr(rect.lo);
+      if (src != target) std::copy(src, src + volume, target);
 
       // sort data in place
-      thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size, stable);
+      thrust_local_sort_inplace(target, nullptr, volume, sort_dim_size, stable);
     }
   }
 };
diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu
index 682b703f0..13e632f9a 100644
--- a/src/cunumeric/sort/sort.cu
+++ b/src/cunumeric/sort/sort.cu
@@ -583,7 +583,6 @@ static SortPiece<VAL> sample_sort_nccl(SortPiece<VAL> local_sorted,
       merge_buffers[i].indices = create_buffer<int64_t>(0, Memory::GPU_FB_MEM);
     }
   }
-
   CHECK_NCCL(ncclGroupStart());
   for (size_t r = 0; r < num_ranks; r++) {
     CHECK_NCCL(ncclSend(val_send_buf.ptr(aligned_pos_vals_send[r]),
@@ -747,11 +746,7 @@ struct SortImplBody<VariantKind::GPU, CODE, DIM> {
         values_ptr = output.ptr(rect.lo);
       }
     }
-
     if (volume > 0) {
-      CHECK_CUDA(cudaMemcpyAsync(
-        values_ptr, input.ptr(rect.lo), sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream));
-
       // sort data (locally)
       local_sort(input.ptr(rect.lo),
                  values_ptr, indices_ptr, indices_ptr,
                  volume, sort_dim_size, stable, stream);
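The sort.cc hunk above and the sort_omp.cc hunk below add an alias guard around the initializing copy: when the runtime maps the input and output regions to the same physical instance, `src` and `target` point at the same memory and the copy can be skipped entirely. The guarded shape as a self-contained sketch (placeholder names):

    #include <algorithm>
    #include <cstddef>

    // Copy-then-sort with an alias guard: skip the initializing copy when
    // input and output are the same buffer. 'cpu_sort_path' is illustrative.
    void cpu_sort_path(const double* src, double* target, std::size_t volume)
    {
      if (src != target) std::copy(src, src + volume, target);
      std::sort(target, target + volume);  // sort data in place
    }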
diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc
index c26d606a5..b3afc6019 100644
--- a/src/cunumeric/sort/sort_omp.cc
+++ b/src/cunumeric/sort/sort_omp.cc
@@ -96,11 +96,12 @@ struct SortImplBody<VariantKind::OMP, CODE, DIM> {
       AccessorWO<VAL, DIM> output = output_array.write_accessor<VAL, DIM>(rect);
 
       // init output values
-      auto* src = input.ptr(rect.lo);
-      std::copy(src, src + volume, output.ptr(rect.lo));
+      auto* src    = input.ptr(rect.lo);
+      auto* target = output.ptr(rect.lo);
+      if (src != target) std::copy(src, src + volume, target);
 
       // sort data in place
-      thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size, stable);
+      thrust_local_sort_inplace(target, nullptr, volume, sort_dim_size, stable);
     }
   }
 };
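Taken together, these last patches remove one device-to-device staging copy on the GPU path, one temporary thunk on the Python side for plain sorts, and the initializing copy on both CPU paths when buffers alias. For reference, the argsort semantics these kernels implement, mirrored with the same iota-then-stable-sort idea the OpenMP path uses (self-contained and illustrative only):

    #include <algorithm>
    #include <cassert>
    #include <numeric>
    #include <vector>

    int main()
    {
      std::vector<double> keys{3.0, 1.0, 2.0};
      std::vector<long> idx(keys.size());
      std::iota(idx.begin(), idx.end(), 0L);  // 0, 1, 2
      std::stable_sort(idx.begin(), idx.end(),
                       [&](long a, long b) { return keys[a] < keys[b]; });
      // idx is now {1, 2, 0}: the argsort of keys
      assert(keys[idx[0]] <= keys[idx[1]] && keys[idx[1]] <= keys[idx[2]]);
      return 0;
    }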