Skip to content

Commit

Permalink
50% optimisations for nvidia (and other GPUs with large work groups);
Browse files Browse the repository at this point in the history
  • Loading branch information
FROL256 committed Feb 20, 2018
1 parent 5a9d76c commit 0becdfe
Show file tree
Hide file tree
Showing 7 changed files with 196 additions and 24 deletions.
11 changes: 6 additions & 5 deletions bitonic_sort.vcxproj
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
Expand All @@ -22,31 +22,32 @@
<ProjectGuid>{F89622BD-0044-4082-A6C6-F5D9678EBAE5}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>bitonic_sort</RootNamespace>
<WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
Expand Down
64 changes: 62 additions & 2 deletions bitonic_sort_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,38 @@ void bitonic_512_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a
clEnqueueNDRangeKernel(other.cmdQueue, other.bitonic512, 1, NULL, &a_size, &localWorkSize, 0, NULL, NULL);
}

// Enqueues the `bitonic1024` kernel over the whole buffer.
//
// a_buffer       - device buffer holding the elements being sorted.
// a_N            - number of elements; a_N / 2 work-items are launched.
// stage          - bitonic stage index, forwarded to the kernel as argument 1.
// passOfStage    - pass the kernel starts from, forwarded as argument 2.
// a_invertModeOn - forwarded as argument 3 (callers in this file pass 0 or 1).
// other          - supplies the kernel handle and the command queue.
//
// The call only enqueues work; it does not wait for completion and the
// OpenCL status codes are not inspected.
// NOTE(review): the work-group size is fixed at 512, so a_N / 2 presumably has
// to be a multiple of 512 for the enqueue to be accepted - confirm with callers.
void bitonic_1024_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a_invertModeOn, BitonicCLArgs other)
{
  const int kernelSize = (a_N >> 1); // half as many work-items as elements

  size_t a_size        = kernelSize;
  size_t localWorkSize = 512;

  clSetKernelArg(other.bitonic1024, 0, sizeof(cl_mem), (void*)&a_buffer);
  clSetKernelArg(other.bitonic1024, 1, sizeof(cl_int), (void*)&stage);
  clSetKernelArg(other.bitonic1024, 2, sizeof(cl_int), (void*)&passOfStage);
  clSetKernelArg(other.bitonic1024, 3, sizeof(cl_int), (void*)&a_invertModeOn);

  clEnqueueNDRangeKernel(other.cmdQueue, other.bitonic1024, 1, NULL, &a_size, &localWorkSize, 0, NULL, NULL);
}

// Enqueues the `bitonic2048` kernel over the whole buffer.
//
// a_buffer       - device buffer holding the elements being sorted.
// a_N            - number of elements; a_N / 2 work-items are launched.
// stage          - bitonic stage index, forwarded to the kernel as argument 1.
// passOfStage    - pass the kernel starts from, forwarded as argument 2.
// a_invertModeOn - forwarded as argument 3 (callers in this file pass 0 or 1).
// other          - supplies the kernel handle and the command queue.
//
// The call only enqueues work; it does not wait for completion and the
// OpenCL status codes are not inspected.
// NOTE(review): the work-group size is fixed at 1024, so a_N / 2 presumably has
// to be a multiple of 1024 for the enqueue to be accepted - confirm with callers.
void bitonic_2048_gpu(cl_mem a_buffer, int a_N, int stage, int passOfStage, int a_invertModeOn, BitonicCLArgs other)
{
  const int kernelSize = (a_N >> 1); // half as many work-items as elements

  size_t a_size        = kernelSize;
  size_t localWorkSize = 1024;

  clSetKernelArg(other.bitonic2048, 0, sizeof(cl_mem), (void*)&a_buffer);
  clSetKernelArg(other.bitonic2048, 1, sizeof(cl_int), (void*)&stage);
  clSetKernelArg(other.bitonic2048, 2, sizeof(cl_int), (void*)&passOfStage);
  clSetKernelArg(other.bitonic2048, 3, sizeof(cl_int), (void*)&a_invertModeOn);

  clEnqueueNDRangeKernel(other.cmdQueue, other.bitonic2048, 1, NULL, &a_size, &localWorkSize, 0, NULL, NULL);
}


void bitonic_sort_gpu_simple(cl_mem a_data, int a_N, BitonicCLArgs other)
{
Expand Down Expand Up @@ -61,13 +93,31 @@ void bitonic_sort_gpu(cl_mem a_data, int a_N, BitonicCLArgs other)
for (int temp = a_N; temp > 2; temp >>= 1)
numStages++;

// not all devices can have large work groups!
//
size_t maxWorkGroupSize = 0;
if (other.dev != 0)
clGetDeviceInfo(other.dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL);
else
maxWorkGroupSize = 256;

// up, form bitonic sequence with half arrays
//
for (int stage = 0; stage < numStages; stage++)
{
for (int passOfStage = stage; passOfStage >= 0; passOfStage--)
{
if (passOfStage > 0 && passOfStage <= 8)
if (passOfStage > 0 && passOfStage <= 10 && maxWorkGroupSize >= 1024)
{
bitonic_2048_gpu(a_data, a_N, stage, passOfStage, 1, other);
break;
}
else if (passOfStage > 0 && passOfStage <= 9 && maxWorkGroupSize >= 512)
{
bitonic_1024_gpu(a_data, a_N, stage, passOfStage, 1, other);
break;
}
else if (passOfStage > 0 && passOfStage <= 8 && maxWorkGroupSize >= 256)
{
bitonic_512_gpu(a_data, a_N, stage, passOfStage, 1, other);
break;
Expand All @@ -81,7 +131,17 @@ void bitonic_sort_gpu(cl_mem a_data, int a_N, BitonicCLArgs other)
//
for (int passOfStage = numStages; passOfStage >= 0; passOfStage--)
{
if (passOfStage > 0 && passOfStage <= 8)
if (passOfStage > 0 && passOfStage <= 10 && maxWorkGroupSize >= 1024)
{
bitonic_2048_gpu(a_data, a_N, numStages - 1, passOfStage, 0, other);
break;
}
else if (passOfStage > 0 && passOfStage <= 9 && maxWorkGroupSize >= 512)
{
bitonic_1024_gpu(a_data, a_N, numStages - 1, passOfStage, 0, other);
break;
}
else if (passOfStage > 0 && passOfStage <= 8 && maxWorkGroupSize >= 256)
{
bitonic_512_gpu(a_data, a_N, numStages - 1, passOfStage, 0, other);
break;
Expand Down
4 changes: 4 additions & 0 deletions bitonic_sort_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@
// Bundle of OpenCL handles passed by value to the bitonic sort host functions.
// The struct declares no initializers, so every member the callee reads must be
// assigned by the caller beforehand.
struct BitonicCLArgs
{
// Kernel used by bitonic_512_gpu.
cl_kernel bitonic512;
// Kernel used by bitonic_1024_gpu (enqueued with a work-group size of 512).
cl_kernel bitonic1024;
// Kernel used by bitonic_2048_gpu (enqueued with a work-group size of 1024).
cl_kernel bitonic2048;
// Kernel created from "bitonic_pass_kernel" in main.cpp.
cl_kernel bitonicPassK;

// Command queue every kernel launch is enqueued on.
cl_command_queue cmdQueue;
// Device queried for CL_DEVICE_MAX_WORK_GROUP_SIZE by bitonic_sort_gpu;
// when it is 0 that function assumes a maximum work-group size of 256.
cl_device_id dev;
};

void bitonic_sort_gpu(cl_mem a_buffer, int a_N, BitonicCLArgs other);
Expand Down
11 changes: 6 additions & 5 deletions clew/clew.vcxproj
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
Expand All @@ -22,31 +22,32 @@
<ProjectGuid>{5F13E40F-C0F1-4EF4-A775-AB8BC703DE88}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>clew</RootNamespace>
<WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
Expand Down
21 changes: 14 additions & 7 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ int main(int argc, const char** argv)

cl_kernel bitonicPassK = bitonicProgs.kernel("bitonic_pass_kernel");
cl_kernel bitonicOpt = bitonicProgs.kernel("bitonic_512");
cl_kernel bitonicOpt2 = bitonicProgs.kernel("bitonic_1024");
cl_kernel bitonicOpt3 = bitonicProgs.kernel("bitonic_2048");

auto gpuData = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int2)*data3.size(), &data3[0], &ciErr1);

Expand All @@ -81,10 +83,12 @@ int main(int argc, const char** argv)

{
BitonicCLArgs args;
args.dev = device;
args.cmdQueue = cmdQueue;
args.bitonicPassK = bitonicPassK;
args.bitonic512 = nullptr; // bitonic_sort_gpu_simple don't use shmem kernel

args.bitonic1024 = nullptr; // bitonic_sort_gpu_simple don't use shmem kernel
args.bitonic2048 = nullptr;
bitonic_sort_gpu_simple(gpuData, int(data2.size()), args);
}

Expand All @@ -109,9 +113,9 @@ int main(int argc, const char** argv)
}

if (passed)
std::cout << "gpu test sort simple PASSED!" << std::endl;
std::cout << "gpu test sort simple\tPASSED!" << std::endl;
else
std::cout << "gpu test sort simple FAILED! (" << faileId << ")" << std::endl;
std::cout << "gpu test sort simple\tFAILED! (" << faileId << ")" << std::endl;

//
//
Expand All @@ -123,9 +127,12 @@ int main(int argc, const char** argv)

{
BitonicCLArgs args;
args.cmdQueue = cmdQueue;
args.dev = device;
args.cmdQueue = cmdQueue;
args.bitonicPassK = bitonicPassK;
args.bitonic512 = bitonicOpt;
args.bitonic512 = bitonicOpt;
args.bitonic1024 = bitonicOpt2;
args.bitonic2048 = bitonicOpt3;

bitonic_sort_gpu(gpuData, int(data3.size()), args);
}
Expand All @@ -151,9 +158,9 @@ int main(int argc, const char** argv)
}

if (passed2)
std::cout << "gpu test sort opt PASSED!" << std::endl;
std::cout << "gpu test sort opt\tPASSED!" << std::endl;
else
std::cout << "gpu test sort opt FAILED! (" << faileId << ")" << std::endl;
std::cout << "gpu test sort opt\tFAILED! (" << faileId << ")" << std::endl;

std::cout << std::endl;
std::cout << "[CPU]: std::sort time = " << time1 << " ms" << std::endl;
Expand Down
98 changes: 98 additions & 0 deletions sort.cl
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,101 @@ __kernel void bitonic_512(__global ElemT* theArray, int stage, int passOfStageBe
theArray[blockId*512 + lid + 256] = s_array[lid + 256];

}

// Runs passes passOfStageBegin..0 of one bitonic stage on a 1024-element tile
// staged in local memory. Each work-item loads two elements, performs one
// compare-exchange per pass, and stores two elements back.
//
// theArray         - global array being sorted (modified in place).
// stage            - stage index; bit `stage` of the global id picks direction.
// passOfStageBegin - first (largest) pass handled here; the loop runs down to 0.
// a_invertModeOn   - masked with the direction bit; 0 disables the reversal.
__kernel void bitonic_1024(__global ElemT* theArray, int stage, int passOfStageBegin, int a_invertModeOn)
{
  const int globalId = get_global_id(0);
  const int localId  = get_local_id(0);

  // 512 work-items cooperate on one 1024-element tile.
  const int tileBase = (globalId / 512) * 1024;

  __local ElemT s_array[1024];

  s_array[localId]       = theArray[tileBase + localId];
  s_array[localId + 512] = theArray[tileBase + localId + 512];

  barrier(CLK_LOCAL_MEM_FENCE);

  // The direction depends only on the global id and kernel arguments,
  // so it is the same for every pass.
  const bool reversed = ((globalId >> stage) & 1) & a_invertModeOn;

  int pass = passOfStageBegin;
  while (pass >= 0)
  {
    const int span = 1 << pass;

    // Insert a zero bit at position `pass` of the local id to get the lower index.
    const int lowIdx  = ((localId >> pass) << (pass + 1)) + (localId & (span - 1));
    const int highIdx = lowIdx + span;

    const ElemT first  = s_array[lowIdx];
    const ElemT second = s_array[highIdx];

    ElemT smaller = second;
    ElemT larger  = first;
    if (compare(first, second))
    {
      smaller = first;
      larger  = second;
    }

    if (reversed)
    {
      s_array[highIdx] = smaller;
      s_array[lowIdx]  = larger;
    }
    else
    {
      s_array[lowIdx]  = smaller;
      s_array[highIdx] = larger;
    }

    // Every pass reads what other work-items wrote in the previous one.
    barrier(CLK_LOCAL_MEM_FENCE);

    pass--;
  }

  theArray[tileBase + localId]       = s_array[localId];
  theArray[tileBase + localId + 512] = s_array[localId + 512];
}


// Runs passes passOfStageBegin..0 of one bitonic stage on a 2048-element tile
// staged in local memory. Each work-item loads two elements, performs one
// compare-exchange per pass, and stores two elements back.
//
// theArray         - global array being sorted (modified in place).
// stage            - stage index; bit `stage` of the global id picks direction.
// passOfStageBegin - first (largest) pass handled here; the loop runs down to 0.
// a_invertModeOn   - masked with the direction bit; 0 disables the reversal.
__kernel void bitonic_2048(__global ElemT* theArray, int stage, int passOfStageBegin, int a_invertModeOn)
{
  const int globalId = get_global_id(0);
  const int localId  = get_local_id(0);

  // 1024 work-items cooperate on one 2048-element tile.
  const int tileBase = (globalId / 1024) * 2048;

  __local ElemT s_array[2048];

  s_array[localId]        = theArray[tileBase + localId];
  s_array[localId + 1024] = theArray[tileBase + localId + 1024];

  barrier(CLK_LOCAL_MEM_FENCE);

  // The direction depends only on the global id and kernel arguments,
  // so it is the same for every pass.
  const bool reversed = ((globalId >> stage) & 1) & a_invertModeOn;

  int pass = passOfStageBegin;
  while (pass >= 0)
  {
    const int span = 1 << pass;

    // Insert a zero bit at position `pass` of the local id to get the lower index.
    const int lowIdx  = ((localId >> pass) << (pass + 1)) + (localId & (span - 1));
    const int highIdx = lowIdx + span;

    const ElemT first  = s_array[lowIdx];
    const ElemT second = s_array[highIdx];

    ElemT smaller = second;
    ElemT larger  = first;
    if (compare(first, second))
    {
      smaller = first;
      larger  = second;
    }

    if (reversed)
    {
      s_array[highIdx] = smaller;
      s_array[lowIdx]  = larger;
    }
    else
    {
      s_array[lowIdx]  = smaller;
      s_array[highIdx] = larger;
    }

    // Every pass reads what other work-items wrote in the previous one.
    barrier(CLK_LOCAL_MEM_FENCE);

    pass--;
  }

  theArray[tileBase + localId]        = s_array[localId];
  theArray[tileBase + localId + 1024] = s_array[localId + 1024];
}

11 changes: 6 additions & 5 deletions vsgl3/vsgl3.vcxproj
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
Expand All @@ -21,29 +21,30 @@
<PropertyGroup Label="Globals">
<ProjectGuid>{2758DD4A-78F6-452F-BBF5-4E86B46BD2EA}</ProjectGuid>
<RootNamespace>vsgl3</RootNamespace>
<WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<WholeProgramOptimization>true</WholeProgramOptimization>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<WholeProgramOptimization>true</WholeProgramOptimization>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
Expand Down

0 comments on commit 0becdfe

Please sign in to comment.