CIS565-Fall-2018 · ascn · Sep 19, 2018
diff --git a/README.md b/README.md
@@ -3,12 +3,85 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Alexander Chan
+* Tested on: Windows 10 Version 1803, i7-5820k @ 3.70 GHz 16GB, GTX 1080 @ 1620 MHz 8GB (Personal Computer)
 
-### (TODO: Your README)
+### README
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* Implemented CPU scan
+* Implemented naive and work-efficient scan
+* Implemented stream compaction using work-efficient scan
 
+## Performance Analysis
+Here are metrics for the scan implementations: CPU, naive, work-efficient, and Thrust.
+
+![](img/scan.png)
+
+The horizontal axis indicates the array size, doubling with every tick (i.e., it is logarithmic). To make the relationship easier to see, the vertical axis is also a log scale.
+
+As we can see, the CPU scan time is almost perfectly linear in the array size. This makes sense, as scan performs O(N) operations. For smaller array sizes,
+the CPU scan is faster than all GPU implementations. This is likely due to the constant overhead of kernel invocations, and the fact that the GPU implementations only use slow global memory, while the CPU is able to take advantage of its cache, which excels at the sequential reads and writes of small elements in the scan algorithm. The work-efficient implementation is slower than the naive implementation, probably because it requires twice as many kernel invocations, in addition to using more global memory. Thrust's implementation is slower, but its run time stays roughly constant across array sizes. This probably means that Thrust is doing other work in addition to performing the scan.
+
+Here are metrics for stream compaction. Once again, the horizontal axis indicates the array size, doubling with every tick, and the vertical axis is a log scale.
+
+![](img/stream-compact.png)
+
+## Output
+
+```
+****************
+** SCAN TESTS **
+****************
+    [  38   4  37  27  41  44  46   5   2   8  23   9  12 ...  13   0 ]
+cpu scan, power-of-two
+0.032968
+    [   0  38  42  79 106 147 191 237 242 244 252 275 284 ... 399406 399419 ]
+cpu scan, non-power-of-two
+0.03359
+    [   0  38  42  79 106 147 191 237 242 244 252 275 284 ... 399313 399349 ]
+    passed
+naive scan, power-of-two
+0.041984
+    passed
+naive scan, non-power-of-two
+0.04096
+    passed
+work-efficient scan, power-of-two
+0.136192
+    passed
+work-efficient scan, non-power-of-two
+0.136192
+    a[8384] = 203041, b[8384] = 602399
+    FAIL VALUE
+thrust scan, power-of-two
+4.56704
+    passed
+thrust scan, non-power-of-two
+0.048096
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   0   1   0   0   1   0   2   1   2   3   1   1 ...   1   0 ]
+cpu compact without scan, power-of-two
+0.039811
+    [   2   1   1   2   1   2   3   1   1   1   2   1   3 ...   2   1 ]
+    passed
+cpu compact without scan, non-power-of-two
+0.038878
+    [   2   1   1   2   1   2   3   1   1   1   2   1   3 ...   1   2 ]
+    passed
+cpu compact with scan
+4.0355
+    [   2   1   1   2   1   2   3   1   1   1   2   1   3 ...   2   1 ]
+    passed
+work-efficient compact, power-of-two
+3.71302
+    [   2   1   1   2   1   2   3   1   1   1   2   1   3 ...   2   1 ]
+    passed
+work-efficient compact, non-power-of-two
+4.68685
+    [   2   1   1   2   1   2   3   1   1   1   2   1   3 ...   1   2 ]
+    passed
+```
diff --git a/img/scan.png b/img/scan.png
diff --git a/img/stream-compact.png b/img/stream-compact.png
diff --git a/src/main.cpp b/src/main.cpp
@@ -13,141 +13,135 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 14; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
 int *c = new int[SIZE];
 
 int main(int argc, char* argv[]) {
-    // Scan tests
-
-    printf("\n");
-    printf("****************\n");
-    printf("** SCAN TESTS **\n");
-    printf("****************\n");
-
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    // initialize b using StreamCompaction::CPU::scan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
-    // At first all cases passed because b && c are all zeroes.
-    zeroArray(SIZE, b);
-    printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(SIZE, b, true);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(NPOT, b, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-	/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-	onesArray(SIZE, c);
-	printDesc("1s array for finding bugs");
-	StreamCompaction::Naive::scan(SIZE, c, a);
-	printArray(SIZE, c, true); */
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, non-power-of-two");
-    StreamCompaction::Thrust::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    printf("\n");
-    printf("*****************************\n");
-    printf("** STREAM COMPACTION TESTS **\n");
-    printf("*****************************\n");
-
-    // Compaction tests
-
-    genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    int count, expectedCount, expectedNPOT;
-
-    // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
-    zeroArray(SIZE, b);
-    printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedCount = count;
-    printArray(count, b, true);
-    printCmpLenResult(count, expectedCount, b, b);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedNPOT = count;
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    system("pause"); // stop Win32 console from closing on exit
+
+
+ printf("\n");
+ printf("****************\n");
+ printf("** SCAN TESTS **\n");
+ printf("****************\n");
+
+ genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+ a[SIZE - 1] = 0;
+ printArray(SIZE, a, true);
+
+ // initialize b using StreamCompaction::CPU::scan you implement
+ // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
+ // At first all cases passed because b && c are all zeroes.
+ zeroArray(SIZE, b);
+ printDesc("cpu scan, power-of-two");
+ StreamCompaction::CPU::scan(SIZE, b, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ printArray(SIZE, b, true);
+
+ zeroArray(SIZE, c);
+ printDesc("cpu scan, non-power-of-two");
+ StreamCompaction::CPU::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ printArray(NPOT, b, true);
+ printCmpResult(NPOT, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("naive scan, power-of-two");
+ StreamCompaction::Naive::scan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(SIZE, c, true);
+ printCmpResult(SIZE, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("naive scan, non-power-of-two");
+ StreamCompaction::Naive::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(SIZE, c, true);
+ printCmpResult(NPOT, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("work-efficient scan, power-of-two");
+ StreamCompaction::Efficient::scan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(SIZE, c, true);
+ printCmpResult(SIZE, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("work-efficient scan, non-power-of-two");
+ StreamCompaction::Efficient::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(NPOT, c, true);
+ printCmpResult(NPOT, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("thrust scan, power-of-two");
+ StreamCompaction::Thrust::scan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(SIZE, c, true);
+ printCmpResult(SIZE, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("thrust scan, non-power-of-two");
+ StreamCompaction::Thrust::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(NPOT, c, true);
+ printCmpResult(NPOT, b, c);
+
+ printf("\n");
+ printf("*****************************\n");
+ printf("** STREAM COMPACTION TESTS **\n");
+ printf("*****************************\n");
+
+ // Compaction tests
+
+ genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
+ a[SIZE - 1] = 0;
+ printArray(SIZE, a, true);
+
+ int count, expectedCount, expectedNPOT;
+
+ // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
+ // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
+ zeroArray(SIZE, b);
+ printDesc("cpu compact without scan, power-of-two");
+ count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ expectedCount = count;
+ printArray(count, b, true);
+ printCmpLenResult(count, expectedCount, b, b);
+
+ zeroArray(SIZE, c);
+ printDesc("cpu compact without scan, non-power-of-two");
+ count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ expectedNPOT = count;
+ printArray(count, c, true);
+ printCmpLenResult(count, expectedNPOT, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("cpu compact with scan");
+ count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ printArray(count, c, true);
+ printCmpLenResult(count, expectedCount, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("work-efficient compact, power-of-two");
+ count = StreamCompaction::Efficient::compact(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ printArray(count, c, true);
+ printCmpLenResult(count, expectedCount, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("work-efficient compact, non-power-of-two");
+ count = StreamCompaction::Efficient::compact(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ printArray(count, c, true);
+ printCmpLenResult(count, expectedNPOT, b, c);
+
+ system("pause"); // stop Win32 console from closing on exit
 	delete[] a;
 	delete[] b;
 	delete[] c;

diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
@@ -18,7 +18,7 @@ int cmpArrays(int n, T *a, T *b) {
 }
 
 void printDesc(const char *desc) {
-    printf("==== %s ====\n", desc);
+    printf("%s\n", desc);
 }
 
 template<typename T>
@@ -72,5 +72,7 @@ void printArray(int n, int *a, bool abridged = false) {
 template<typename T>
 void printElapsedTime(T time, std::string note = "")
 {
-	std::cout << "   elapsed time: " << time << "ms    " << note << std::endl;
+
+	std::cout << time << std::endl;
+	//std::cout << "   elapsed time: " << time << "ms    " << note << std::endl;
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )