diff --git a/bitonic_sort.vcxproj b/bitonic_sort.vcxproj index 0954b91..6e54283 100644 --- a/bitonic_sort.vcxproj +++ b/bitonic_sort.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -22,31 +22,32 @@ {F89622BD-0044-4082-A6C6-F5D9678EBAE5} Win32Proj bitonic_sort + 10.0.16299.0 Application true - v120 + v141 Unicode Application true - v120 + v141 Unicode Application false - v120 + v141 true Unicode Application false - v120 + v141 true Unicode diff --git a/bitonic_sort_gpu.cpp b/bitonic_sort_gpu.cpp index 1f10413..441a6aa 100644 --- a/bitonic_sort_gpu.cpp +++ b/bitonic_sort_gpu.cpp @@ -34,6 +34,38 @@ void bitonic_512_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a clEnqueueNDRangeKernel(other.cmdQueue, other.bitonic512, 1, NULL, &a_size, &localWorkSize, 0, NULL, NULL); } +void bitonic_1024_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a_invertModeOn, BitonicCLArgs other) +{ + const int kernelSize = (a_N >> 1); + + int iSize = kernelSize; + size_t a_size = kernelSize; + size_t localWorkSize = 512; + + clSetKernelArg(other.bitonic1024, 0, sizeof(cl_mem), (void*)&a_buffer); + clSetKernelArg(other.bitonic1024, 1, sizeof(cl_int), (void*)&stage); + clSetKernelArg(other.bitonic1024, 2, sizeof(cl_int), (void*)&passOfStage); + clSetKernelArg(other.bitonic1024, 3, sizeof(cl_int), (void*)&a_invertModeOn); + + clEnqueueNDRangeKernel(other.cmdQueue, other.bitonic1024, 1, NULL, &a_size, &localWorkSize, 0, NULL, NULL); +} + +void bitonic_2048_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a_invertModeOn, BitonicCLArgs other) +{ + const int kernelSize = (a_N >> 1); + + int iSize = kernelSize; + size_t a_size = kernelSize; + size_t localWorkSize = 1024; + + clSetKernelArg(other.bitonic2048, 0, sizeof(cl_mem), (void*)&a_buffer); + clSetKernelArg(other.bitonic2048, 1, sizeof(cl_int), (void*)&stage); + clSetKernelArg(other.bitonic2048, 2, sizeof(cl_int), (void*)&passOfStage); + clSetKernelArg(other.bitonic2048, 3, sizeof(cl_int), (void*)&a_invertModeOn); + + clEnqueueNDRangeKernel(other.cmdQueue, other.bitonic2048, 1, NULL, &a_size, &localWorkSize, 0, NULL, NULL); +} + void bitonic_sort_gpu_simple(cl_mem a_data, int a_N, BitonicCLArgs other) { @@ -61,13 +93,31 @@ void bitonic_sort_gpu(cl_mem a_data, int a_N, BitonicCLArgs other) for (int temp = a_N; temp > 2; temp >>= 1) numStages++; + // not all devices can have large work groups! + // + size_t maxWorkGroupSize = 0; + if (other.dev != 0) + clGetDeviceInfo(other.dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL); + else + maxWorkGroupSize = 256; + // up, form bitonic sequence with half allays // for (int stage = 0; stage < numStages; stage++) { for (int passOfStage = stage; passOfStage >= 0; passOfStage--) { - if (passOfStage > 0 && passOfStage <= 8) + if (passOfStage > 0 && passOfStage <= 10 && maxWorkGroupSize >= 1024) + { + bitonic_2048_gpu(a_data, a_N, stage, passOfStage, 1, other); + break; + } + else if (passOfStage > 0 && passOfStage <= 9 && maxWorkGroupSize >= 512) + { + bitonic_1024_gpu(a_data, a_N, stage, passOfStage, 1, other); + break; + } + else if (passOfStage > 0 && passOfStage <= 8 && maxWorkGroupSize >= 256) { bitonic_512_gpu(a_data, a_N, stage, passOfStage, 1, other); break; @@ -81,7 +131,17 @@ void bitonic_sort_gpu(cl_mem a_data, int a_N, BitonicCLArgs other) // for (int passOfStage = numStages; passOfStage >= 0; passOfStage--) { - if (passOfStage > 0 && passOfStage <= 8) + if (passOfStage > 0 && passOfStage <= 10 && maxWorkGroupSize >= 1024) + { + bitonic_2048_gpu(a_data, a_N, numStages - 1, passOfStage, 0, other); + break; + } + else if (passOfStage > 0 && passOfStage <= 9 && maxWorkGroupSize >= 512) + { + bitonic_1024_gpu(a_data, a_N, numStages - 1, passOfStage, 0, other); + break; + } + else if (passOfStage > 0 && passOfStage <= 8 && maxWorkGroupSize >= 256) { bitonic_512_gpu(a_data, a_N, numStages - 1, passOfStage, 0, other); break; diff --git a/bitonic_sort_gpu.h b/bitonic_sort_gpu.h index e3dd7de..6d67a11 100644 --- a/bitonic_sort_gpu.h +++ b/bitonic_sort_gpu.h @@ -5,8 +5,12 @@ struct BitonicCLArgs { cl_kernel bitonic512; + cl_kernel bitonic1024; + cl_kernel bitonic2048; cl_kernel bitonicPassK; + cl_command_queue cmdQueue; + cl_device_id dev; }; void bitonic_sort_gpu(cl_mem a_buffer, int a_N, BitonicCLArgs other); diff --git a/clew/clew.vcxproj b/clew/clew.vcxproj index 0afb9be..7f058b0 100644 --- a/clew/clew.vcxproj +++ b/clew/clew.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -22,31 +22,32 @@ {5F13E40F-C0F1-4EF4-A775-AB8BC703DE88} Win32Proj clew + 10.0.16299.0 StaticLibrary true - v120 + v141 Unicode StaticLibrary true - v120 + v141 Unicode StaticLibrary false - v120 + v141 true Unicode StaticLibrary false - v120 + v141 true Unicode diff --git a/main.cpp b/main.cpp index 754e40e..2d6d56a 100644 --- a/main.cpp +++ b/main.cpp @@ -68,6 +68,8 @@ int main(int argc, const char** argv) cl_kernel bitonicPassK = bitonicProgs.kernel("bitonic_pass_kernel"); cl_kernel bitonicOpt = bitonicProgs.kernel("bitonic_512"); + cl_kernel bitonicOpt2 = bitonicProgs.kernel("bitonic_1024"); + cl_kernel bitonicOpt3 = bitonicProgs.kernel("bitonic_2048"); auto gpuData = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int2)*data3.size(), &data3[0], &ciErr1); @@ -81,10 +83,12 @@ int main(int argc, const char** argv) { BitonicCLArgs args; + args.dev = device; args.cmdQueue = cmdQueue; args.bitonicPassK = bitonicPassK; args.bitonic512 = nullptr; // bitonic_sort_gpu_simple don't use shmem kernel - + args.bitonic1024 = nullptr; // bitonic_sort_gpu_simple don't use shmem kernel + args.bitonic2048 = nullptr; bitonic_sort_gpu_simple(gpuData, int(data2.size()), args); } @@ -109,9 +113,9 @@ int main(int argc, const char** argv) } if (passed) - std::cout << "gpu test sort simple PASSED!" << std::endl; + std::cout << "gpu test sort simple\tPASSED!" << std::endl; else - std::cout << "gpu test sort simple FAILED! (" << faileId << ")" << std::endl; + std::cout << "gpu test sort simple\tFAILED! (" << faileId << ")" << std::endl; // // @@ -123,9 +127,12 @@ int main(int argc, const char** argv) { BitonicCLArgs args; - args.cmdQueue = cmdQueue; + args.dev = device; + args.cmdQueue = cmdQueue; args.bitonicPassK = bitonicPassK; - args.bitonic512 = bitonicOpt; + args.bitonic512 = bitonicOpt; + args.bitonic1024 = bitonicOpt2; + args.bitonic2048 = bitonicOpt3; bitonic_sort_gpu(gpuData, int(data3.size()), args); } @@ -151,9 +158,9 @@ int main(int argc, const char** argv) } if (passed2) - std::cout << "gpu test sort opt PASSED!" << std::endl; + std::cout << "gpu test sort opt\tPASSED!" << std::endl; else - std::cout << "gpu test sort opt FAILED! (" << faileId << ")" << std::endl; + std::cout << "gpu test sort opt\tFAILED! (" << faileId << ")" << std::endl; std::cout << std::endl; std::cout << "[CPU]: std::sort time = " << time1 << " ms" << std::endl; diff --git a/sort.cl b/sort.cl index b2f52bb..de5f972 100644 --- a/sort.cl +++ b/sort.cl @@ -85,3 +85,101 @@ __kernel void bitonic_512(__global ElemT* theArray, int stage, int passOfStageBe theArray[blockId*512 + lid + 256] = s_array[lid + 256]; } + +__kernel void bitonic_1024(__global ElemT* theArray, int stage, int passOfStageBegin, int a_invertModeOn) +{ + int tid = get_global_id(0); + int lid = get_local_id(0); + + int blockId = tid / 512; + + __local ElemT s_array[1024]; + + s_array[lid + 0 ] = theArray[blockId * 1024 + lid + 0]; + s_array[lid + 512] = theArray[blockId * 1024 + lid + 512]; + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int passOfStage = passOfStageBegin; passOfStage >= 0; passOfStage--) + { + const int j = lid; + const int r = 1 << (passOfStage); + const int lmask = r - 1; + + const int left = ((j >> passOfStage) << (passOfStage + 1)) + (j & lmask); + const int right = left + r; + + const ElemT a = s_array[left]; + const ElemT b = s_array[right]; + + const bool cmpRes = compare(a, b); + + const ElemT minElem = cmpRes ? a : b; + const ElemT maxElem = cmpRes ? b : a; + + const int oddEven = tid >> stage; // (j >> stage) + + const bool isSwap = (oddEven & 1) & a_invertModeOn; + + const int minId = isSwap ? right : left; + const int maxId = isSwap ? left : right; + + s_array[minId] = minElem; + s_array[maxId] = maxElem; + + barrier(CLK_LOCAL_MEM_FENCE); + } + + theArray[blockId * 1024 + lid + 0] = s_array[lid + 0]; + theArray[blockId * 1024 + lid + 512] = s_array[lid + 512]; +} + + +__kernel void bitonic_2048(__global ElemT* theArray, int stage, int passOfStageBegin, int a_invertModeOn) +{ + int tid = get_global_id(0); + int lid = get_local_id(0); + + int blockId = tid / 1024; + + __local ElemT s_array[2048]; + + s_array[lid + 0 ] = theArray[blockId * 2048 + lid + 0]; + s_array[lid + 1024] = theArray[blockId * 2048 + lid + 1024]; + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int passOfStage = passOfStageBegin; passOfStage >= 0; passOfStage--) + { + const int j = lid; + const int r = 1 << (passOfStage); + const int lmask = r - 1; + + const int left = ((j >> passOfStage) << (passOfStage + 1)) + (j & lmask); + const int right = left + r; + + const ElemT a = s_array[left]; + const ElemT b = s_array[right]; + + const bool cmpRes = compare(a, b); + + const ElemT minElem = cmpRes ? a : b; + const ElemT maxElem = cmpRes ? b : a; + + const int oddEven = tid >> stage; // (j >> stage) + + const bool isSwap = (oddEven & 1) & a_invertModeOn; + + const int minId = isSwap ? right : left; + const int maxId = isSwap ? left : right; + + s_array[minId] = minElem; + s_array[maxId] = maxElem; + + barrier(CLK_LOCAL_MEM_FENCE); + } + + theArray[blockId * 2048 + lid + 0] = s_array[lid + 0]; + theArray[blockId * 2048 + lid + 1024] = s_array[lid + 1024]; +} + diff --git a/vsgl3/vsgl3.vcxproj b/vsgl3/vsgl3.vcxproj index 809156e..2a5d637 100644 --- a/vsgl3/vsgl3.vcxproj +++ b/vsgl3/vsgl3.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -21,29 +21,30 @@ {2758DD4A-78F6-452F-BBF5-4E86B46BD2EA} vsgl3 + 10.0.16299.0 StaticLibrary MultiByte true - v120 + v141 StaticLibrary MultiByte - v120 + v141 StaticLibrary MultiByte true - v120 + v141 StaticLibrary MultiByte - v120 + v141