diff --git a/INSTRUCTION.md b/INSTRUCTION.md index 792c530..8be3652 100644 --- a/INSTRUCTION.md +++ b/INSTRUCTION.md @@ -242,7 +242,6 @@ The title should be "Project 2: YOUR NAME". The template of the comment section of your pull request is attached below, you can do some copy and paste: * [Repo Link](https://link-to-your-repo) -* `Your PENNKEY` * (Briefly) Mentions features that you've completed. Especially those bells and whistles you want to highlight * Feature 0 * Feature 1 diff --git a/README.md b/README.md index b71c458..3491851 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,109 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Name: Jiahao Liu +* Tested on: Windows 10, i7-3920XM CPU @ 2.90GHz 3.10 GHz 16GB, GTX 980m SLI 8192MB (personal computer) -### (TODO: Your README) +Project Description and features implemented +====================== + +### Project Description + +This project is intended to compare the running-performance differences in computing prefix sums among CPU scan, naive GPU scan, efficient GPU scan and thrust scan. + +### Features implemented + +* CPU scan + +* Naive GPU scan with shared memory + +* Efficient GPU scan + +* Thrust scan + +Performance Analysis +====================== + +After experimenting, I found that the program has the same performance for all block sizes, so I just chose the largest power of two as my block size. + +![](img/1.png) + +![](img/2.png) + +When the time to copy data from host to device is included, thrust scan and naive scan run no faster than the CPU scan. + +![](img/3.png) + +Even with the data copy, the efficient scan runs faster than the CPU scan. + +![](img/4.png) + +When only the computation time is included, we get the graph above. Thrust scan may have some built-in operations that really cost time. 
Naive scan has roughly more memory accesses than the efficient scan (n^2/2 vs 3*n) but fewer arithmetic and comparison operations. This means the bottleneck is memory I/O. We cannot evaluate thrust since we don't even know what happens inside. + +It is very interesting that the power-of-two thrust scan took a noticeable amount of time. Based on what I know about accessing arrays in RAM with power-of-two lengths, this may be a consequence of access conflicts due to the hardware cost of locating the next memory unit. + +### Running Result + +``` +**************** +** SCAN TESTS ** +**************** + [ 28 13 1 37 12 43 45 30 45 30 16 35 30 ... 33 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.001324ms (std::chrono Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9519 9552 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.001655ms (std::chrono Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9421 9449 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.006976ms (CUDA Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9519 9552 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.00688ms (CUDA Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0ms (CUDA Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9519 9552 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0ms (CUDA Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9421 9449 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.025152ms (CUDA Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9519 9552 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.0216ms (CUDA Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9421 9449 ] + passed -Include analysis, etc. 
(Remember, this is public, so don't put -anything here that you don't want to share with the world.) +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 1 3 3 1 2 2 1 2 0 2 3 3 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.001655ms (std::chrono Measured) + [ 1 3 3 1 2 2 1 2 2 3 3 1 2 ... 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.001655ms (std::chrono Measured) + [ 1 3 3 1 2 2 1 2 2 3 3 1 2 ... 3 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.001655ms (std::chrono Measured) + [ 1 3 3 1 2 2 1 2 2 3 3 1 2 ... 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.474944ms (CUDA Measured) + [ 1 3 3 1 2 2 1 2 2 3 3 1 2 ... 3 2 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.416544ms (CUDA Measured) + [ 1 3 3 1 2 2 1 2 2 3 3 1 2 ... 3 1 ] + passed +``` \ No newline at end of file diff --git a/img/1.png b/img/1.png new file mode 100644 index 0000000..33c6fd9 Binary files /dev/null and b/img/1.png differ diff --git a/img/2.png b/img/2.png new file mode 100644 index 0000000..21c3ed6 Binary files /dev/null and b/img/2.png differ diff --git a/img/3.png b/img/3.png new file mode 100644 index 0000000..75a0772 Binary files /dev/null and b/img/3.png differ diff --git a/img/4.png b/img/4.png new file mode 100644 index 0000000..79161af Binary files /dev/null and b/img/4.png differ diff --git a/src/main.cpp b/src/main.cpp index 7305641..be529f4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 384; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int a[SIZE], b[SIZE], c[SIZE]; @@ -49,42 +49,42 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); 
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -129,14 +129,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..5c1e416 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,11 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + bools[index] = idata[index] == 0 ? 
0 : 1; } /** @@ -32,8 +37,12 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO - } + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n || bools[index] == 0) { + return; + } + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..9b9e12c 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,15 +1,15 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** @@ -20,6 +20,10 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + odata[0] = 0; + for (int i(1); i < n; ++i) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -29,10 +33,14 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; + int compactSum = 0; + + for (int i(0); i < n; ++i) { + if (idata[i] == 0) continue; + odata[compactSum++] = idata[i]; + } + + return compactSum; } /** @@ -41,10 +49,24 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; + int *flag = new int[n]; + int m = 0; + + for (int i = 0; i < n; ++i) { + flag[i] = idata[i] == 0 ? 
0 : 1; + } + + scan(n, odata, flag); + m = odata[n - 1]; + + for (int i = 0; i < n; ++i) { + if (flag[i] == 0) continue; + odata[odata[i]] = idata[i]; + } + + delete flag; + + return m; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..227b368 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,39 +2,114 @@ #include #include "common.h" #include "efficient.h" - +#define BLOCK_SIZE 896 namespace StreamCompaction { - namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - - /** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; - } - } -} + namespace Efficient { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ + __global__ void upSweep(const int n, const int step, int *data) { + + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + int rIndex = n - 1 - index; + if (index - step >= 0 && (rIndex % (step * 2) == 0)) { + data[index] = data[index] + data[index - step]; + } + __syncthreads(); + } + + __global__ void downSweep(const int n, const int step, int *data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + int rIndex = n - 1 - index; + ///Only certain index is working. + if (index - step >= 0 && (rIndex % (step * 2) == 0) ) { + auto tmp = data[index]; + data[index] += data[index - step]; + data[index - step] = tmp; + } + __syncthreads(); + } + + void scanOnGPU(const int n, int *dev_data) { + dim3 blockCount = (n - 1) / BLOCK_SIZE + 1; + int step; + for (step = 1; step < n; step <<= 1) { + upSweep << > >(n, step, dev_data); + } + cudaMemset(&dev_data[n - 1], 0, sizeof(int)); + for (step >>= 1; step > 0; step >>= 1) { + downSweep << > >(n, step, dev_data); + } + } + + void scan(int n, int *odata, const int *idata) { + // TODO + int *dev_data; + cudaMalloc((void**)&dev_data, sizeof(int) * n); + cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n, cudaMemcpyHostToDevice); + scanOnGPU(n, dev_data); + cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data); + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. 
+ */ + int compact(int n, int *odata, const int *idata) { + // TODO + int count = 0; + int *dev_data; + int *dev_dataCopy; + int *dev_bool; + int *dev_indices; + for (int i = 0; i < n; ++i) + count = count + (idata[i] != 0); + + // device memory allocation + timer().startGpuTimer(); + + cudaMalloc((void**)&dev_data, sizeof(int) * n); + cudaMalloc((void**)&dev_dataCopy, sizeof(int) * n); + cudaMalloc((void**)&dev_bool, sizeof(int) * n); + cudaMalloc((void**)&dev_indices, sizeof(int) * n); + // copy input data to device + cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n, cudaMemcpyHostToDevice); + dim3 blockCount = (n - 1) / BLOCK_SIZE + 1; + Common::kernMapToBoolean << > >(n, dev_bool, dev_data); + cudaMemcpy((void*)dev_indices, (const void*)dev_bool, sizeof(int) * n, cudaMemcpyDeviceToDevice); + scanOnGPU(n, dev_indices); + cudaMemcpy((void*)dev_dataCopy, (const void*)dev_data, sizeof(int) * n, cudaMemcpyDeviceToDevice); + Common::kernScatter << > >(n, dev_data, dev_dataCopy, dev_bool, dev_indices); + // copy result to host + cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + + timer().endGpuTimer(); + // free memory on device + cudaFree(dev_data); + cudaFree(dev_dataCopy); + cudaFree(dev_bool); + cudaFree(dev_indices); + + return count; + } + } +} \ No newline at end of file diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..05186b4 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,23 +3,96 @@ #include "common.h" #include "naive.h" +#define BLOCK_SIZE 896 namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } // TODO: __global__ + // TODO: __global__ + __global__ void work(const int n, int 
*idata, int *odata) { + extern __shared__ int temp[]; + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= n) { + return; + } + /* + if (index >= n) { + return; + } + + if (index - step < 0) { + odata[index] = idata[index]; + } else { + odata[index] = idata[index] + idata[index - step]; + } + */ + + int input = 1, output = 0; + temp[index] = idata[index]; + __syncthreads(); + //33 43 45 + //0 33 43 45 + //0 33 76 88 + //0 33 76 121 + for (int step = 1; step < n; step <<= 1) { + input ^= 1; + output ^= 1; + if (index - step < 0) { + temp[output * n + index] = temp[input * n + index]; + } + else { + temp[output * n + index] = temp[input * n + index] + temp[input * n + index - step]; + } + __syncthreads(); + } + odata[index] = temp[(output*n) + index]; + } + + __global__ void moveToExclusive(const int n, int *idata, int *odata) { + + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= n) { + return; + } + else if (index == 0) { + odata[index] = 0; + return; + } + + odata[index] = idata[index - 1]; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); +// timer().startGpuTimer(); + int *dev_input; + int *dev_output; + int input = 1; + int output = 0; + // device memory allocation + cudaMalloc((void**)&dev_input, sizeof(int) * n); + cudaMalloc((void**)&dev_output, sizeof(int) * n); + cudaMemcpy(dev_output, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + dim3 blockCount = (n - 1) / BLOCK_SIZE + 1; + moveToExclusive << > >(n, dev_output, dev_input); + //We want exclusive result. Not inclusive. 
+ timer().startGpuTimer(); + work << > >(n, dev_input, dev_output); + timer().endGpuTimer(); + cudaDeviceSynchronize(); + cudaMemcpy(odata, dev_output, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_input); + cudaFree(dev_output); +// timer().endGpuTimer(); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..523515b 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -8,21 +8,27 @@ namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { +// timer().startGpuTimer(); + int *dev_data; + cudaMalloc((void**)&dev_data, sizeof(int) * n); + cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n, cudaMemcpyHostToDevice); + thrust::device_ptr dev_thrust_data(dev_data); timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); + thrust::exclusive_scan(dev_thrust_data, dev_thrust_data + n, dev_thrust_data); + timer().endGpuTimer(); + cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data); +// timer().endGpuTimer(); } } }