diff --git a/README.md b/README.md
index 0e38ddb..d9dec4a 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,81 @@ CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Emily Vo
+  * [LinkedIn](linkedin.com/in/emilyvo), [personal website](emilyhvo.com)
+* Tested on: Windows 10, i7-7700HQ @ 2.8GHz 16GB, GTX 1060 6GB (Personal Computer)
+
+Updated the CMakeLists.txt to sm_61.
-### (TODO: Your README)
+### PERFORMANCE ANALYSIS
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* Comparison of the GPU scan implementations (Naive, Work-Efficient, and Thrust) against the serial CPU scan, with array size on the independent axis:
+
+![](img/runtime_vs_size.PNG)
+
+* Observations on what might be happening inside the Thrust implementation (e.g. allocation, memory copy), based on the Nsight timeline for its execution:
+
+The first Thrust run (power of two) is much slower than the second (non-power of two), and much slower than all other implementations at every array size except those greater than 2^20. This might be because the first call into Thrust spends extra time setting up the library and its utility classes (a warm-up sketch that would factor this cost out of the timings appears just before the test output below).
+
+The CPU implementation is surprisingly fast compared to the parallel implementations. This is likely due to the lack of kernel-launch overhead and to cache friendliness, since the CPU processes the array sequentially.
+
+The work-efficient implementation is slower than the naive one. This is surprising, since the work-efficient scan performs fewer operations, but as with the CPU implementation, the naive version wins on overhead: fewer kernel invocations and no extra copying between host and device. As the arrays get larger, I would expect this overhead to stay roughly constant, so the work-efficient implementation should eventually become faster.
+
+When we look at the curves, the CPU implementation grows linearly as the array size grows (the line only looks exponential because the x-axis increases exponentially). This is expected, since the number of operations is linear in the array size. The other curves grow much more slowly. The Thrust implementation stays almost constant, indicating that the majority of its measured time is not actually spent computing the scan.
+
+* Output of the test program:
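The warm-up sketch referenced above — a minimal, hypothetical example (not part of this repo) of paying Thrust's one-time setup cost before any timed run, so the first measured scan is not inflated by library and CUDA-context initialization:

```cpp
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/scan.h>

// Hypothetical helper: run one tiny exclusive_scan so that CUDA context
// creation and Thrust's internal setup happen before anything is timed.
void warmUpThrust() {
    thrust::device_vector<int> tmp(1, 0);
    thrust::exclusive_scan(tmp.begin(), tmp.end(), tmp.begin());
    cudaDeviceSynchronize(); // make sure the warm-up work has actually finished
}
```

If a call like `warmUpThrust()` were made once at the top of `main()`, the first power-of-two Thrust timing would be expected to fall in line with the second. The raw test output follows.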
+```
+****************
+** SCAN TESTS **
+****************
+ [ 0 1 2 3 4 5 6 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 0.000365ms (std::chrono Measured)
+ [ 0 0 1 3 6 10 15 21 ]
+==== cpu scan, non-power-of-two ====
+ elapsed time: 0ms (std::chrono Measured)
+ [ 0 0 1 3 6 ]
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 0.014464ms (CUDA Measured)
+ passed
+==== naive scan, non-power-of-two ====
+ elapsed time: 0.012608ms (CUDA Measured)
+ passed
+==== work-efficient scan, power-of-two ====
+ elapsed time: 0.116896ms (CUDA Measured)
+ [ 0 0 1 3 6 10 15 21 ]
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 0.08192ms (CUDA Measured)
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 4.4583ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 0.014304ms (CUDA Measured)
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 0 3 2 1 3 2 0 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 0ms (std::chrono Measured)
+ [ 3 2 1 3 2 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 0.000365ms (std::chrono Measured)
+ [ 3 2 1 3 ]
+ passed
+==== cpu compact with scan ====
+ elapsed time: 2.09687ms (std::chrono Measured)
+ [ 3 2 1 3 2 ]
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 0.088352ms (CUDA Measured)
+ [ 3 2 1 3 2 ]
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 0.081344ms (CUDA Measured)
+ [ 3 2 1 3 ]
+ passed
+```
diff --git a/img/runtime_vs_size.PNG b/img/runtime_vs_size.PNG
new file mode 100644
index 0000000..525b037
Binary files /dev/null and b/img/runtime_vs_size.PNG differ
diff --git a/src/main.cpp b/src/main.cpp
index 1850161..37729d3 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,83 +13,208 @@
 #include
 #include "testing_helpers.hpp"
-const int SIZE = 1 << 8; // feel free to change the size of array
-const int NPOT = SIZE - 3; // Non-Power-Of-Two
-int *a = new int[SIZE];
-int *b = new int[SIZE];
-int *c = new int[SIZE];
 int main(int argc, char* argv[]) {
-    // Scan tests
+    //for (int i = 12; i <= 20; i++) {
+    int SIZE = 1 << 8; // feel free to change the size of array
+    int NPOT = SIZE - 3; // Non-Power-Of-Two
+    int *a = new int[SIZE];
+    int *b = new int[SIZE];
+    int *c = new int[SIZE];
+
+    genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
+    a[SIZE - 1] = 0;
+    //printArray(SIZE, a, true);
+
+    // initialize b using StreamCompaction::CPU::scan you implement
+    // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
+    // At first all cases passed because b && c are all zeroes.
+ zeroArray(SIZE, b); + //printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + //printArray(SIZE, b, true); + //printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + //printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + //printArray(NPOT, b, true); + //printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + //printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + //printCmpResult(SIZE, b, c); + + + zeroArray(SIZE, c); + //printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + //printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + //printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + //printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + //printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + //printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + //printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + //printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + //printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + //printCmpResult(NPOT, b, c); + + + // Compaction tests + + genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + //printArray(SIZE, a, true); + int count, expectedCount, expectedNPOT; + + // initialize b using StreamCompaction::CPU::compactWithoutScan you implement + // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
+ zeroArray(SIZE, b); + //printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedCount = count; + //printArray(count, b, true); + //printCmpLenResult(count, expectedCount, b, b); + + zeroArray(SIZE, c); + //printDesc("cpu compact without scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedNPOT = count; + //printArray(count, c, true); + //printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + //printDesc("cpu compact with scan"); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + //printArray(count, c, true); + //printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + //printDesc("work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + //printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + //printDesc("work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + //printCmpLenResult(count, expectedNPOT, b, c); + std::cout << "" << std::endl; + delete[] a; + delete[] b; + delete[] c; + + //} + + // Scan tests +//int a[SIZE] = { 0, 1, 2, 3, 4, 5, 6, 7 }; + + + + system("pause"); // stop Win32 console from closing on exit + + + /* + // Scan tests + //int a[SIZE] = { 0, 1, 2, 3, 4, 5, 6, 7 }; printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); + //genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + //printArray(SIZE, a, true); // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. 
zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); + //printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); + //printArray(SIZE, b, true); + //printCmpResult(NPOT, b, c); zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); + //printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); + //printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); + //printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); + //printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); + //printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); +//printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); + //printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); + //printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); + //printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); @@ -111,37 +236,37 @@ int main(int argc, char* argv[]) { // initialize b using StreamCompaction::CPU::compactWithoutScan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
 zeroArray(SIZE, b);
-    printDesc("cpu compact without scan, power-of-two");
+    //printDesc("cpu compact without scan, power-of-two");
     count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     expectedCount = count;
-    printArray(count, b, true);
+    //printArray(count, b, true);
     printCmpLenResult(count, expectedCount, b, b);

     zeroArray(SIZE, c);
-    printDesc("cpu compact without scan, non-power-of-two");
+    //printDesc("cpu compact without scan, non-power-of-two");
     count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     expectedNPOT = count;
-    printArray(count, c, true);
+    //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);

     zeroArray(SIZE, c);
-    printDesc("cpu compact with scan");
+    //printDesc("cpu compact with scan");
     count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
+    //printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);

     zeroArray(SIZE, c);
-    printDesc("work-efficient compact, power-of-two");
+    //printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     //printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);

     zeroArray(SIZE, c);
-    printDesc("work-efficient compact, non-power-of-two");
+    //printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     //printArray(count, c, true);
@@ -151,4 +276,5 @@ int main(int argc, char* argv[]) {
     delete[] a;
     delete[] b;
     delete[] c;
+    */
 }
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
index 46337ab..6cbcd68 100644
--- a/src/testing_helpers.hpp
+++ b/src/testing_helpers.hpp
@@ -1,8 +1,8 @@
 #pragma once
-#include
-#include
-#include
+#include
+#include
+#include
 #include
 #include
@@ -18,7 +18,7 @@ int cmpArrays(int n, T *a, T *b) {
 }

 void printDesc(const char *desc) {
-    printf("==== %s ====\n", desc);
+    printf("%s\n", desc);
 }

 template <typename T>
@@ -69,8 +69,9 @@ void printArray(int n, int *a, bool abridged = false) {
     printf("]\n");
 }

-template <typename T>
-void printElapsedTime(T time, std::string note = "")
-{
-    std::cout << " elapsed time: " << time << "ms " << note << std::endl;
+template <typename T>
+void printElapsedTime(T time, std::string note = "")
+{
+    //std::cout << " elapsed time: " << time << "ms " << note << std::endl;
+    std::cout << time << std::endl;
 }
\ No newline at end of file
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..4bb0dc2 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES

 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 05ce667..b3f5567 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,26 +1,45 @@
 #include
 #include "cpu.h"
-#include "common.h"
+#include "common.h"

 namespace StreamCompaction {
     namespace CPU {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }

+        void printArr(int n, int *arr) {
+            printf("[");
+            for (int i = 0; i < n; i++) {
+                printf("%d ", arr[i]);
+            }
+            printf("]");
+        }
+
         /**
          * CPU scan (prefix sum).
          * For performance analysis, this is supposed to be a simple for loop.
          * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
-            // TODO
-            timer().endCpuTimer();
+            // the timer throws if it is already running (e.g. when scan is
+            // called from compactWithScan), so only stop it if we started it
+            bool end = true;
+            try {
+                timer().startCpuTimer();
+            }
+            catch (std::exception) {
+                end = false;
+            }
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = idata[i - 1] + odata[i - 1];
+            }
+            if (end) {
+                timer().endCpuTimer();
+            }
         }

         /**
@@ -30,9 +49,15 @@
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int odataIdx = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[odataIdx] = idata[i];
+                    odataIdx++;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return odataIdx;
         }

         /**
@@ -42,9 +67,24 @@
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int *tmpArr = new int[n];
+            int *scanRes = new int[n];
+            for (int i = 0; i < n; i++) {
+                tmpArr[i] = idata[i] != 0 ? 1 : 0;
+            }
+
+            scan(n, scanRes, tmpArr);
+            //printf("scanRes: ");
+            //printArr(n, scanRes);
+
+            for (int i = 0; i < n; i++) {
+                if (tmpArr[i] == 1) {
+                    odata[scanRes[i]] = idata[i];
+                }
+            }
+
             timer().endCpuTimer();
-            return -1;
+            // count of kept elements: the exclusive-scan total plus the last flag
+            int count = scanRes[n - 1] + tmpArr[n - 1];
+            delete[] tmpArr;
+            delete[] scanRes;
+            return count;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 36c5ef2..4b17aad 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,108 @@
             return timer;
         }

+        __global__ void kernUpward(int n, int d, int *data) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n) {
+                return;
+            }
+
+            int power_d = 1 << d;
+            int power_d1 = 1 << (d + 1);
+
+            if (k % power_d1 == 0) {
+                int idx = k + power_d - 1;
+                int idx_1 = k + power_d1 - 1;
+                data[idx_1] += data[idx];
+            }
+        }
+
+        __global__ void kernDownward(int n, int d, int *data) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n) {
+                return;
+            }
+
+            int power_d = 1 << d;
+            int power_d1 = 1 << (d + 1);
+
+            if (k % power_d1 == 0) {
+                int idx = k + power_d - 1;
+                int idx_1 = k + power_d1 - 1;
+                int t = data[idx];
+                data[idx] = data[idx_1];
+                data[idx_1] += t;
+            }
+        }
+
+        void scanEfficient(int n, int *dev_data) {
+            int logn = ilog2ceil(n);
+            int length = 1 << ilog2ceil(n);
+            int blockSize = 128;
+            dim3 blocksPerGrid((length + blockSize - 1) / blockSize);
+            dim3 threadsPerBlock(blockSize);
+
+            for (int i = 0; i < logn; i++) {
+                kernUpward<<<blocksPerGrid, threadsPerBlock>>>(length, i, dev_data);
+                checkCUDAError("kernUpwardSweep failed", __LINE__);
+            }
+            int zero = 0;
+            cudaMemcpy(dev_data + length - 1, &zero, sizeof(int), cudaMemcpyHostToDevice);
+            for (int i = logn - 1; i >= 0; i--) {
+                kernDownward<<<blocksPerGrid, threadsPerBlock>>>(length, i, dev_data);
+                checkCUDAError("kernDownwardSweep failed", __LINE__);
+            }
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
+            int length = 1 << ilog2ceil(n);
+            int size = length * sizeof(int);
+            int *idata_padded = new int[length];
+
+            // pad the array if the length is not a power of 2
+            for (int i = 0; i < n; i++) {
+                idata_padded[i] = idata[i];
+            }
+            for (int i = n; i < length; i++) {
+                idata_padded[i] = 0;
+            }
+
+            // copy the padded data to the device
+            int *dev_data;
+            cudaMalloc((void**)&dev_data, size);
+            checkCUDAError("ERROR: cudaMalloc dev_data", __LINE__);
+            cudaMemcpy(dev_data, idata_padded, size, cudaMemcpyHostToDevice);
+            checkCUDAError("ERROR: cudaMemcpy to device", __LINE__);
+
+            // perform the scan; the timer throws if it is already running
+            // (e.g. when scan is called from compact), so only stop it if we started it
+            bool end = true;
+            try {
+                timer().startGpuTimer();
+            }
+            catch (std::exception) {
+                end = false;
+            }
+            scanEfficient(n, dev_data);
+            if (end) {
+                timer().endGpuTimer();
+            }
+
+            // copy the results back to host and free data
+            cudaMemcpy(idata_padded, dev_data, size, cudaMemcpyDeviceToHost);
+            checkCUDAError("ERROR: cudaMemcpy to host", __LINE__);
+            cudaFree(dev_data);
+
+            // copy to output data and free padded array
+            for (int i = 0; i < n; i++) {
+                odata[i] = idata_padded[i];
+            }
+
+            delete[] idata_padded;
+        }

         /**
@@ -30,11 +125,75 @@
          * @param idata The array of elements to compact.
          * @returns The number of elements remaining after compaction.
          */
+        __global__ void kernCreateMask(int n, int *dev_odata, int *dev_idata) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n) {
+                return;
+            }
+
+            dev_odata[k] = dev_idata[k] == 0 ? 0 : 1;
+        }
+
+        __global__ void kernCompact(int n, int *tmp, int *scanRes, int *dev_odata, int *dev_idata) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n) {
+                return;
+            }
+            if (tmp[k] == 1) {
+                dev_odata[scanRes[k]] = dev_idata[k];
+            }
+        }
+
         int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-            return -1;
+            int blockSize = 128;
+            dim3 blocksPerGrid((n + blockSize - 1) / blockSize);
+            dim3 threadsPerBlock(blockSize);
+
+            int length = 1 << ilog2ceil(n);
+            int size = length * sizeof(int);
+            int *idata_padded = new int[length];
+
+            // pad the array if the length is not a power of 2
+            for (int i = 0; i < n; i++) {
+                idata_padded[i] = idata[i];
+            }
+            for (int i = n; i < length; i++) {
+                idata_padded[i] = 0;
+            }
+
+            int *dev_odata, *dev_idata, *dev_tmp, *dev_scanRes;
+            int *scanRes = new int[length];
+
+            // allocate the buffers and copy the data
+            cudaMalloc((void**)&dev_odata, size);
+            checkCUDAError("ERROR: cudaMalloc of dev_odata");
+            cudaMalloc((void**)&dev_idata, size);
+            checkCUDAError("ERROR: cudaMalloc of dev_idata");
+            cudaMalloc((void**)&dev_tmp, size);
+            checkCUDAError("ERROR: cudaMalloc of tmp");
+            // zero the mask buffer so the padded tail does not feed garbage into the scan total
+            cudaMemset(dev_tmp, 0, size);
+            cudaMalloc((void**)&dev_scanRes, size);
+            checkCUDAError("ERROR: cudaMalloc of scanRes");
+            cudaMemcpy(dev_idata, idata_padded, size, cudaMemcpyHostToDevice);
+            checkCUDAError("ERROR: cudaMemcpy idata failed");
+
+            timer().startGpuTimer();
+            kernCreateMask<<<blocksPerGrid, threadsPerBlock>>>(n, dev_tmp, dev_idata);
+            cudaMemcpy(dev_scanRes, dev_tmp, size, cudaMemcpyDeviceToDevice);
+            scanEfficient(n, dev_scanRes);
+            kernCompact<<<blocksPerGrid, threadsPerBlock>>>(n, dev_tmp, dev_scanRes, dev_odata, dev_idata);
+            timer().endGpuTimer();
+            cudaMemcpy(odata, dev_odata, size, cudaMemcpyDeviceToHost);
+            cudaMemcpy(scanRes, dev_scanRes, size, cudaMemcpyDeviceToHost);
+            // exclusive-scan total plus the last element's flag gives the compacted count
+            int result = scanRes[length - 1] + (idata_padded[length - 1] != 0 ? 1 : 0);
+            checkCUDAError("ERROR: cudaMemcpy of output data");
+
+            cudaFree(dev_odata);
+            cudaFree(dev_idata);
+            cudaFree(dev_tmp);
+            cudaFree(dev_scanRes);
+            delete[] scanRes;
+            delete[] idata_padded;
+
+            return result;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 9218f8e..549225e 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -5,21 +5,75 @@
 namespace StreamCompaction {
     namespace Naive {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }

         // TODO: __global__
+        __global__ void kernScanNaive(int n, int d, int *odata, int *idata) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n) {
+                return;
+            }
+
+            int power = 1 << (d - 1);
+            if (k >= power) {
+                odata[k] = idata[k - power] + idata[k];
+            }
+            else {
+                odata[k] = idata[k];
+            }
+        }
+
+        __global__ void kernShiftRight(int n, int *odata, int *idata) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n) {
+                return;
+            }
+
+            odata[k] = (k == 0) ? 0 : idata[k - 1];
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int blockSize = 128;
+            dim3 blocksPerGrid((n + blockSize - 1) / blockSize);
+            dim3 threadsPerBlock(blockSize);
+
+            int *dev_odata, *dev_idata;
+            int size = n * sizeof(int);
+
+            // allocate the buffers and copy the data
+            cudaMalloc((void**)&dev_odata, size);
+            checkCUDAError("ERROR: cudaMalloc of dev_odata", __LINE__);
+            cudaMalloc((void**)&dev_idata, size);
+            checkCUDAError("ERROR: cudaMalloc of dev_idata", __LINE__);
+            cudaMemcpy(dev_idata, idata, size, cudaMemcpyHostToDevice);
+            checkCUDAError("ERROR: cudaMemcpy idata failed", __LINE__);
+
             timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
+            int its = ilog2ceil(n);
+            for (int i = 1; i <= its; i++) {
+                kernScanNaive<<<blocksPerGrid, threadsPerBlock>>>(n, i, dev_odata, dev_idata);
+                checkCUDAError("ERROR: naive scan", __LINE__);
+
+                std::swap(dev_odata, dev_idata);
+            }
+
+            // convert from inclusive to exclusive
+            kernShiftRight<<<blocksPerGrid, threadsPerBlock>>>(n, dev_odata, dev_idata);
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_odata, size, cudaMemcpyDeviceToHost);
+            checkCUDAError("ERROR: cudaMemcpy of output data", __LINE__);
+
+            cudaFree(dev_odata);
+            cudaFree(dev_idata);
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 36b732d..405d710 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -8,21 +8,27 @@
 namespace StreamCompaction {
     namespace Thrust {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }

         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+            thrust::host_vector<int> hv(n);
+            thrust::copy(idata, idata + n, hv.begin());
+            thrust::device_vector<int> i_dv = hv;
+            thrust::device_vector<int> o_dv(n);
+
+            timer().startGpuTimer();
+            thrust::exclusive_scan(i_dv.begin(), i_dv.end(), o_dv.begin());
             timer().endGpuTimer();
+
+            thrust::copy(o_dv.begin(), o_dv.end(), odata);
         }
     }
 }
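A closing note tied to the performance analysis in the README: the work-efficient kernels above launch one thread per element at every depth and mask most of them off with `k % power_d1 == 0`, which is one plausible reason the work-efficient scan trails the naive one. Below is a hedged sketch of the usual remedy — launching only the threads that do work at each depth and remapping their indices. `kernUpwardCompacted` is a hypothetical name and is not part of this diff.

```cpp
// Upsweep step d with one thread per active node instead of one per element.
// Launch with numThreads = length >> (d + 1); this touches exactly the same
// pairs as kernUpward's "k % (1 << (d + 1)) == 0" branch, but with no idle,
// divergent threads in the grid.
__global__ void kernUpwardCompacted(int numThreads, int d, int *data) {
    int t = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (t >= numThreads) {
        return;
    }
    int stride = 1 << (d + 1);          // distance between nodes updated at this depth
    int right = (t + 1) * stride - 1;   // the index kernUpward calls idx_1
    int left = right - (1 << d);        // the index kernUpward calls idx
    data[right] += data[left];
}
```

The host loop would then shrink the grid each iteration (e.g. `dim3 grid(((length >> (d + 1)) + blockSize - 1) / blockSize);`), and the downsweep can be compacted the same way.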