Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Project2-Ju Yang #26

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
68 changes: 62 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,69 @@
CUDA Stream Compaction
======================

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
### University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2

* (TODO) YOUR NAME HERE
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
Ju Yang

### (TODO: Your README)
### Tested on: Windows 7, i7-4710MQ @ 2.50GHz 8GB, GTX 870M 6870MB (Hasee Notebook K770E-i7)
![result](doc/1024.png)

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
## TODOs finished:
### 1. naive.cu

`__global__ void naive_sum(int n, int* odata, int* idata);`

void scan(int n, int *odata, const int *idata);

### 2. efficient.cu

`__global__ void prescan(int *g_odata, int *g_idata, int n, int *temp);`

void scan(int n, int *odata, const int *idata);

int compact(int n, int *odata, const int *idata);


### 3. thrust.cu

void scan(int n, int *odata, const int *idata);


### 4. cpu.cu

void scan(int n, int *odata, const int *idata);

int compactWithoutScan(int n, int *odata, const int *idata) ;

int compactWithScan(int n, int *odata, const int *idata);

### 5. common.cu

`__global__ void kernMapToBoolean(int n, int *bools, const int *idata);`

`__global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices);`

### Modified the main.cpp a little bit for display.

## Performance Graph

### Scanning
![result](doc/image001.gif)
### Thrust Scanning
![result](doc/data_29123_image001.gif)
### Compact
![result](doc/data_6317_image001.gif)

## Analysis
### Thrust
As we can see, `thrust::exclusive_scan` is rather time-consuming compared with the other methods. Even though I used `device_vector` to store the data, it is still the slowest.
However, since I did not free the device_vectors, the second round (the non-power-of-two run) is much faster.
I think the reason is that when a thrust function is called, it requests some blocks/threads on the GPU and releases them later.
Although I tried my best to avoid any reads/writes between the CPU and the GPU, the scan function still spends some time allocating space.

### Unfixed Known Bugs
#### 1. When using multiple blocks, the result is sometimes wrong. I think this is because `__syncthreads()` does not synchronize across blocks?
#### 2. Since I used only 1 block, when SIZE is more than 1024 (which is the limit), the result is apparently wrong.
#### 3. CPU performance is much better, and the computation time does not always increase with SIZE.
I think this is because SIZE is still not large enough?
Binary file added doc/1024.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/128.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/16.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/256.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/32.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/512.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/64.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/8.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/data.xls
Binary file not shown.
Binary file added doc/data_29123_image001.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/data_6317_image001.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/image001.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 8 additions & 5 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,17 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array


const int SIZE = 1 << 10; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int a[SIZE], b[SIZE], c[SIZE];

int main(int argc, char* argv[]) {

// Scan tests
printf("Size= %d, Non-Pow2 Size= %d. \n", SIZE, NPOT);

printf("\n");
printf("****************\n");
printf("** SCAN TESTS **\n");
printf("****************\n");
Expand All @@ -42,7 +45,7 @@ int main(int argc, char* argv[]) {
printDesc("cpu scan, non-power-of-two");
StreamCompaction::CPU::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(NPOT, b, true);
//printArray(NPOT, b, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
Expand Down Expand Up @@ -115,14 +118,14 @@ int main(int argc, char* argv[]) {
count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
expectedNPOT = count;
printArray(count, c, true);
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

zeroArray(SIZE, c);
printDesc("cpu compact with scan");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(count, c, true);
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
Expand Down
16 changes: 16 additions & 0 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    // Writes bools[i] = 1 when idata[i] is non-zero and 0 otherwise, for
    // every i in [0, n).
    //
    // The index combines the block and thread coordinates so the kernel
    // also works when the launch uses more than one block (assumes a 1-D
    // grid of 1-D blocks).
    const int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads beyond the end of the arrays (e.g. when the launch size is
    // rounded up) must not touch memory.
    if (index >= n) {
        return;
    }

    bools[index] = (idata[index] != 0) ? 1 : 0;
}

/**
Expand All @@ -33,6 +42,13 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    // For every i in [0, n) whose bools[i] is non-zero, copies idata[i]
    // to odata[indices[i]].
    //
    // The index combines the block and thread coordinates so the kernel
    // also works when the launch uses more than one block (assumes a 1-D
    // grid of 1-D blocks).
    const int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads beyond the end of the arrays (e.g. when the launch size is
    // rounded up) must not touch memory.
    if (index >= n) {
        return;
    }

    if (bools[index] != 0) {
        odata[indices[index]] = idata[index];
    }
}

}
Expand Down
52 changes: 44 additions & 8 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
#include <cstdio>
#include <vector>
#include "cpu.h"

#include "common.h"
#include "common.h"

namespace StreamCompaction {
namespace CPU {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
using StreamCompaction::Common::PerformanceTimer;
// Returns a reference to a single function-local static PerformanceTimer,
// so every call hands back the same timer object.
PerformanceTimer& timer()
{
// Constructed on first use and kept for the rest of the program.
static PerformanceTimer timer;
return timer;
}

/**
Expand All @@ -20,6 +20,14 @@ namespace StreamCompaction {
void scan(int n, int *odata, const int *idata) {
    // CPU exclusive prefix sum: odata[i] = idata[0] + ... + idata[i - 1],
    // so odata[0] is 0 and the last input element is not part of any
    // output. The previous version stored the running total *after*
    // adding idata[i] (an inclusive scan), which does not match an
    // exclusive scan such as thrust::exclusive_scan.
    //
    // n     - number of elements to process
    // odata - output array, at least n elements
    // idata - input array, at least n elements
    timer().startCpuTimer();

    int running = 0;
    for (int i = 0; i < n; ++i) {
        // Read before writing so the result is still correct if odata and
        // idata refer to the same array.
        const int value = idata[i];
        odata[i] = running;
        running += value;
    }

    timer().endCpuTimer();
}

Expand All @@ -31,8 +39,21 @@ namespace StreamCompaction {
int compactWithoutScan(int n, int *odata, const int *idata) {
    // Copies every non-zero element of idata to the front of odata,
    // preserving order, without using a scan.
    //
    // n     - number of input elements
    // odata - output array, at least as large as the number of non-zeros
    // idata - input array, at least n elements
    // Returns the number of elements written to odata. The leftover
    // "return -1" placeholder that preceded the real return is removed.
    timer().startCpuTimer();

    int kept = 0;  // next free slot in odata == number of non-zeros so far
    for (int i = 0; i < n; ++i) {
        const int value = idata[i];
        if (value != 0) {
            odata[kept] = value;
            ++kept;
        }
    }

    timer().endCpuTimer();
    return kept;
}

/**
Expand All @@ -43,8 +64,23 @@ namespace StreamCompaction {
int compactWithScan(int n, int *odata, const int *idata) {
    // Copies every non-zero element of idata to the front of odata,
    // preserving order, using the map -> exclusive scan -> scatter steps.
    // The previous body was a copy of compactWithoutScan and never
    // performed a scan; it also had a leftover "return -1" placeholder
    // before the real return.
    //
    // n     - number of input elements
    // odata - output array, at least as large as the number of non-zeros
    // idata - input array, at least n elements
    // Returns the number of elements written to odata (0 when n <= 0).
    const int count = n > 0 ? n : 0;

    // Scratch buffers are allocated before the timer starts so that only
    // the three passes below are measured.
    std::vector<int> keep(count);  // 1 where idata[i] != 0, else 0
    std::vector<int> slot(count);  // exclusive scan of keep: output index

    timer().startCpuTimer();

    // Map: flag the elements that survive compaction.
    for (int i = 0; i < count; ++i) {
        keep[i] = (idata[i] != 0) ? 1 : 0;
    }

    // Scan: exclusive prefix sum of the flags. Done inline rather than by
    // calling scan(), because scan() starts and stops this same timer.
    int running = 0;
    for (int i = 0; i < count; ++i) {
        slot[i] = running;
        running += keep[i];
    }

    // Scatter: slot[i] <= i, so this is also safe when odata == idata.
    for (int i = 0; i < count; ++i) {
        if (keep[i] != 0) {
            odata[slot[i]] = idata[i];
        }
    }

    timer().endCpuTimer();
    return running;
}
}
}
Loading