diff --git a/README.md b/README.md
index 0e38ddb..899a135 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,53 @@ CUDA Stream Compaction
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Yan Wu
+ * [LinkedIn](https://www.linkedin.com/in/yan-wu-a71270159/)
+* Tested on: Windows 10 Education, i7-8750H @ 2.2GHz 16GB, GTX 1060 6GB (Personal Laptop)
-### (TODO: Your README)
+### Project Description
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+This project has us implement a few different versions of the Scan (Prefix Sum) algorithm:
+1. Implementing a CPU version of the algorithm.
+2. Writing a few GPU implementations: "naive" and "work-efficient."
+3. Using some of these to implement GPU stream compaction.
+
+* [Algorithm Slides](https://docs.google.com/presentation/d/1ETVONA7QDM-WqsEj4qVOGD6Kura5I6E9yqH-7krnwZ0/edit#slide=id.p126)
+* Algorithm Demonstration:
+ * Naive Inclusive Scan:
+
+ * Work Efficient Scan (up-sweep):
+
+ * Work Efficient Scan (down-sweep):
+
+
+### Result and Performance Analysis
+
+* A sample of outcomes:
+ * Scan Test:
+
+ * Stream Compaction Test:
+
+
+* Result Sheets:
+ * Scan Test Result:
+
+ * Compaction Test Result:
+
+
+* Analysis:
+ * Scan Test Charts:
+
+ * Compaction Test Charts:
+
+  Note that the two red lines labeled "cpu compact with scan" in the two charts above represent the same data.
+ * Analysis:
+  First, we can see that the GPU does not perform significantly better when the test array is short. But when the array becomes very large, it becomes a burden for the CPU, and the execution time increases almost exponentially. Under these circumstances we observe much better performance from the GPU.
+  Taking a closer look, my work-efficient algorithm is slower than the naive method when the test array is relatively short. As the array size increases, the work-efficient method trends toward better performance. The Thrust method is steady: it may seem slow at first, but as the array size increases its running time remains almost the same. Comparing all these methods, the Thrust scan is clearly the best when the array is extremely large.
+
+* Q & A:
+ * Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU.
+ My block size was 128.
+  * Why is the work-efficient method slower than the naive one?
+  When implementing the algorithm from the course slides, we did reduce the calculation count, since we only need to compute half of the elements involved in each round. Even so, half of each warp did no work at all. Implementing a proper parallel reduction and reducing the number of active warps could save a lot of time on this issue.
diff --git a/img/CompactTestsSample.PNG b/img/CompactTestsSample.PNG
new file mode 100644
index 0000000..d33d9c9
Binary files /dev/null and b/img/CompactTestsSample.PNG differ
diff --git a/img/compactChart.PNG b/img/compactChart.PNG
new file mode 100644
index 0000000..6830a99
Binary files /dev/null and b/img/compactChart.PNG differ
diff --git a/img/compactSheet.PNG b/img/compactSheet.PNG
new file mode 100644
index 0000000..d038a75
Binary files /dev/null and b/img/compactSheet.PNG differ
diff --git a/img/downSweep.PNG b/img/downSweep.PNG
new file mode 100644
index 0000000..4ea1577
Binary files /dev/null and b/img/downSweep.PNG differ
diff --git a/img/scanChart.PNG b/img/scanChart.PNG
new file mode 100644
index 0000000..45300dd
Binary files /dev/null and b/img/scanChart.PNG differ
diff --git a/img/scanSheet.PNG b/img/scanSheet.PNG
new file mode 100644
index 0000000..70519cb
Binary files /dev/null and b/img/scanSheet.PNG differ
diff --git a/img/scanTestsSample.PNG b/img/scanTestsSample.PNG
new file mode 100644
index 0000000..b83b5e7
Binary files /dev/null and b/img/scanTestsSample.PNG differ
diff --git a/img/upSweep.PNG b/img/upSweep.PNG
new file mode 100644
index 0000000..f1a6bc9
Binary files /dev/null and b/img/upSweep.PNG differ
diff --git a/src/main.cpp b/src/main.cpp
index 1850161..e3ada2a 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
#include
#include "testing_helpers.hpp"
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 24; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
@@ -126,12 +126,13 @@ int main(int argc, char* argv[]) {
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
- zeroArray(SIZE, c);
- printDesc("cpu compact with scan");
- count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
- printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
- printArray(count, c, true);
- printCmpLenResult(count, expectedCount, b, c);
+ zeroArray(SIZE, c);
+ printDesc("cpu compact with scan");
+ count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ //printArray(count, c, true);
+ expectedCount = count;
+ printCmpLenResult(count, expectedCount, b, c);
zeroArray(SIZE, c);
printDesc("work-efficient compact, power-of-two");
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..4bb0dc2 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
cuda_add_library(stream_compaction
${SOURCE_FILES}
- OPTIONS -arch=sm_20
+ OPTIONS -arch=sm_61
)
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 8fc0211..6e9f616 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,4 +1,5 @@
#include "common.h"
+#include "device_launch_parameters.h"
void checkCUDAErrorFn(const char *msg, const char *file, int line) {
cudaError_t err = cudaGetLastError();
@@ -24,6 +25,9 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n) return;
+ bools[index] = idata[index] == 0 ? 0 : 1;
}
/**
@@ -33,6 +37,11 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n) return;
+ if (bools[index] == 1) {
+ odata[indices[index]] = idata[index];
+ }
}
}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 05ce667..2bd0037 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,15 +1,15 @@
#include
#include "cpu.h"
-#include "common.h"
+#include "common.h"
namespace StreamCompaction {
namespace CPU {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
/**
@@ -18,9 +18,13 @@ namespace StreamCompaction {
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int *odata, const int *idata) {
- timer().startCpuTimer();
- // TODO
- timer().endCpuTimer();
+ //timer().startCpuTimer();
+ // TODO
+ odata[0] = 0;
+ for (int i = 1; i < n; ++i) {
+ odata[i] = odata[i - 1] + idata[i - 1];
+ }
+ //timer().endCpuTimer();
}
/**
@@ -31,8 +35,14 @@ namespace StreamCompaction {
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
+ int index = 0;
+ for (int i = 0; i < n; i++) {
+ if (idata[i] != 0) {
+ odata[index++] = idata[i];
+ }
+ }
timer().endCpuTimer();
- return -1;
+ return index;
}
/**
@@ -41,10 +51,24 @@ namespace StreamCompaction {
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
- timer().startCpuTimer();
+ // allocate space for two middle status arrays
+ int *temp1 = (int*)malloc(n * sizeof(int));
+ int *temp2 = (int*)malloc(n * sizeof(int));
+ timer().startCpuTimer();
// TODO
+ for (int i = 0; i < n; ++i) {
+ temp1[i] = idata[i] == 0 ? 0 : 1;
+ }
+ scan(n, temp2, temp1);
+ int index = 0;
+ for (int i = 0; i < n; ++i) {
+ if (temp1[i] == 1) {
+ odata[temp2[i]] = idata[i];
+ index++;
+ }
+ }
timer().endCpuTimer();
- return -1;
+ return index;
}
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 36c5ef2..bee4a74 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,23 +2,62 @@
#include
#include "common.h"
#include "efficient.h"
+#include "device_launch_parameters.h"
+#define blockSize 128
+
namespace StreamCompaction {
namespace Efficient {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
+ __global__ void upSweep(int n, int k, int *dev) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n) return;
+
+ if ((index % (2 * k) == 0) && (index + (2 * k) <= n))
+ dev[index + (2 * k) - 1] += dev[index + k - 1];
+ }
+
+ __global__ void downSweep(int n, int k, int *idata) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n) return;
+ // need to check boundary
+ if ((index % (2 * k) == 0) && (index + (2 * k) <= n)) {
+ int temp = idata[index + k - 1];
+ idata[index + k - 1] = idata[index + (2 * k) - 1];
+ idata[index + (2 * k) - 1] += temp;
+ }
+ }
+
void scan(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
- // TODO
+ int *exclusive;
+ int length = pow(2, ilog2ceil(n));
+ cudaMalloc((void**)&exclusive, length * sizeof(int));
+ cudaMemset(exclusive, 0, length * sizeof(int));
+ cudaMemcpy(exclusive, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ dim3 fullBlocksPerGrid((length + blockSize - 1) / blockSize);
+ timer().startGpuTimer();
+ // TODO
+ // up-sweep
+ for (int d = 1; d < length; d *= 2) {
+ upSweep<<< fullBlocksPerGrid, blockSize >>>(length, d, exclusive);
+ }
+ cudaMemset(exclusive + length - 1, 0, sizeof(int));
+ // down-sweep
+ for (int d = length / 2; d >= 1; d /= 2) {
+ downSweep<<< fullBlocksPerGrid, blockSize >>>(length, d, exclusive);
+ }
timer().endGpuTimer();
+ cudaMemcpy(odata, exclusive, n * sizeof(int), cudaMemcpyDeviceToHost);
+ cudaFree(exclusive);
}
/**
@@ -31,10 +70,42 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
+ int *bools;
+ int *indices;
+ int *i_aug;
+ int *o_aug;
+ int length = pow(2, ilog2ceil(n));
+ cudaMalloc((void**)&bools, length * sizeof(int));
+ cudaMalloc((void**)&indices, length * sizeof(int));
+ cudaMalloc((void**)&i_aug, n * sizeof(int));
+ cudaMalloc((void**)&o_aug, n * sizeof(int));
+
+ cudaMemset(bools, 0, length * sizeof(int));
+ cudaMemset(indices, 0, length * sizeof(int));
+
+ cudaMemcpy(i_aug, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+ dim3 fullBlocksPerGrid((length + blockSize - 1) / blockSize);
+
+ //timer().startGpuTimer();
// TODO
- timer().endGpuTimer();
- return -1;
+ StreamCompaction::Common::kernMapToBoolean<<< fullBlocksPerGrid, blockSize >>>(n, bools, i_aug);
+ scan(n, indices, bools);
+ StreamCompaction::Common::kernScatter <<< fullBlocksPerGrid, blockSize >>>(n, o_aug, i_aug, bools, indices);
+ //timer().endGpuTimer();
+ cudaMemcpy(odata, o_aug, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+ int num1 = 0;
+ cudaMemcpy(&num1, &bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+ int num2 = 0;
+ cudaMemcpy(&num2, &indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+
+ cudaFree(bools);
+ cudaFree(indices);
+ cudaFree(i_aug);
+ cudaFree(o_aug);
+
+ return num1 + num2;
}
}
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 9218f8e..16d6b1e 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,24 +2,62 @@
#include
#include "common.h"
#include "naive.h"
+#include "device_launch_parameters.h"
+
+#define blockSize 128
+
namespace StreamCompaction {
namespace Naive {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
// TODO: __global__
-
+ __global__ void naiveParallelScan(int n, int k, int *odata, int *idata) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n) return;
+ if (index >= k) {
+ odata[index] = idata[index] + idata[index - k];
+ }
+ else {
+ odata[index] = idata[index];
+ }
+ }
+ __global__ void toExclusive(int n, int *odata, int *idata) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n) return;
+ if (index > 0) {
+ odata[index] = idata[index - 1];
+ }
+ else {
+ odata[index] = 0;
+ }
+ }
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ int* inclusive;
+ int* exclusive;
+ cudaMalloc((int**)&inclusive, n * sizeof(int));
+ cudaMalloc((int**)&exclusive, n * sizeof(int));
+ cudaMemcpy(inclusive, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
timer().startGpuTimer();
// TODO
+ for (int d = 0; d <= ilog2ceil(n); d++) {
+ naiveParallelScan<<< fullBlocksPerGrid, blockSize >>>(n, pow(2.0, d), exclusive, inclusive);
+ // ping-pong
+ std::swap(exclusive, inclusive);
+ }
+ toExclusive<<< fullBlocksPerGrid, blockSize >>>(n, exclusive, inclusive);
timer().endGpuTimer();
+ cudaMemcpy(odata, exclusive, n * sizeof(int), cudaMemcpyDeviceToHost);
+ cudaFree(inclusive);
+ cudaFree(exclusive);
}
}
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 36b732d..5bd0963 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -8,21 +8,29 @@
namespace StreamCompaction {
namespace Thrust {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
+ thrust::device_vector dv_in(idata, idata + n);
+ thrust::device_vector dv_out(odata, odata + n);
+
+ timer().startGpuTimer();
// TODO use `thrust::exclusive_scan`
// example: for device_vectors dv_in and dv_out:
// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
- timer().endGpuTimer();
+ thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+ timer().endGpuTimer();
+ thrust::copy(dv_out.begin(), dv_out.end(), odata);
+
+
}
}
}