diff --git a/INSTRUCTION.md b/INSTRUCTION.md index 792c530..8be3652 100644 --- a/INSTRUCTION.md +++ b/INSTRUCTION.md @@ -242,7 +242,6 @@ The title should be "Project 2: YOUR NAME". The template of the comment section of your pull request is attached below, you can do some copy and paste: * [Repo Link](https://link-to-your-repo) -* `Your PENNKEY` * (Briefly) Mentions features that you've completed. Especially those bells and whistles you want to highlight * Feature 0 * Feature 1 diff --git a/README.md b/README.md index b71c458..3491851 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,109 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Name: Jiahao Liu +* Tested on: Windows 10, i7-3920XM CPU @ 2.90GHz 3.10 GHz 16GB, GTX 980m SLI 8192MB (personal computer) -### (TODO: Your README) +Project Description and features implemented +====================== + +### Project Description + +This project is intended to compare the running-performance differences in computing prefix sums among CPU scan, naive GPU scan, efficient GPU scan and thrust scan. + +### Features implemented + +* CPU scan + +* Naive GPU scan with shared memory + +* Efficient GPU scan + +* Thrust scan + +Performance Analysis +====================== + +After experimenting, I found that the program has the same performance for all block sizes, so I just chose the largest power of two as my block size. + +![](img/1.png) + +![](img/2.png) + +When the time to copy data from host to device is included, thrust scan and naive scan run no faster than the CPU scan. + +![](img/3.png) + +Even with the data copy, the efficient scan runs faster than the CPU scan. + +![](img/4.png) + +When only the computation time is included, we get the graph above. Thrust scan may have some built-in operations that really cost time. 
Naive scan has roughly more memory accesses than the efficient scan (n^2/2 vs 3*n) but fewer arithmetic and comparison operations. This means the bottleneck is memory I/O. We cannot evaluate thrust since we don't even know what happens inside. + +It is very interesting that the power-of-two thrust scan took a noticeable amount of time. Based on what I know about accessing arrays in RAM with power-of-two lengths, this may be a consequence of access conflicts due to the hardware cost of locating the next memory unit. + +### Running Result + +``` +**************** +** SCAN TESTS ** +**************** + [ 28 13 1 37 12 43 45 30 45 30 16 35 30 ... 33 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.001324ms (std::chrono Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9519 9552 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.001655ms (std::chrono Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9421 9449 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.006976ms (CUDA Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9519 9552 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.00688ms (CUDA Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0ms (CUDA Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9519 9552 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0ms (CUDA Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9421 9449 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.025152ms (CUDA Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9519 9552 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.0216ms (CUDA Measured) + [ 0 28 41 42 79 91 134 179 209 254 284 300 335 ... 9421 9449 ] + passed -Include analysis, etc. 
(Remember, this is public, so don't put -anything here that you don't want to share with the world.) +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 1 3 3 1 2 2 1 2 0 2 3 3 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.001655ms (std::chrono Measured) + [ 1 3 3 1 2 2 1 2 2 3 3 1 2 ... 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.001655ms (std::chrono Measured) + [ 1 3 3 1 2 2 1 2 2 3 3 1 2 ... 3 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.001655ms (std::chrono Measured) + [ 1 3 3 1 2 2 1 2 2 3 3 1 2 ... 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.474944ms (CUDA Measured) + [ 1 3 3 1 2 2 1 2 2 3 3 1 2 ... 3 2 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.416544ms (CUDA Measured) + [ 1 3 3 1 2 2 1 2 2 3 3 1 2 ... 3 1 ] + passed +``` \ No newline at end of file diff --git a/img/1.png b/img/1.png new file mode 100644 index 0000000..33c6fd9 Binary files /dev/null and b/img/1.png differ diff --git a/img/2.png b/img/2.png new file mode 100644 index 0000000..21c3ed6 Binary files /dev/null and b/img/2.png differ diff --git a/img/3.png b/img/3.png new file mode 100644 index 0000000..75a0772 Binary files /dev/null and b/img/3.png differ diff --git a/img/4.png b/img/4.png new file mode 100644 index 0000000..79161af Binary files /dev/null and b/img/4.png differ diff --git a/src/main.cpp b/src/main.cpp index 7305641..be529f4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 384; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int a[SIZE], b[SIZE], c[SIZE]; @@ -49,42 +49,42 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); 
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -129,14 +129,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..5c1e416 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,11 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + bools[index] = idata[index] == 0 ? 
0 : 1; } /** @@ -32,8 +37,12 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO - } + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n || bools[index] == 0) { + return; + } + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..9b9e12c 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,15 +1,15 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** @@ -20,6 +20,10 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + odata[0] = 0; + for (int i(1); i < n; ++i) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -29,10 +33,14 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; + int compactSum = 0; + + for (int i(0); i < n; ++i) { + if (idata[i] == 0) continue; + odata[compactSum++] = idata[i]; + } + + return compactSum; } /** @@ -41,10 +49,24 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; + int *flag = new int[n]; + int m = 0; + + for (int i = 0; i < n; ++i) { + flag[i] = idata[i] == 0 ? 
0 : 1; + } + + scan(n, odata, flag); + m = odata[n - 1]; + + for (int i = 0; i < n; ++i) { + if (flag[i] == 0) continue; + odata[odata[i]] = idata[i]; + } + + delete flag; + + return m; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..227b368 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,39 +2,114 @@ #include #include "common.h" #include "efficient.h" - +#define BLOCK_SIZE 896 namespace StreamCompaction { - namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - - /** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; - } - } -} + namespace Efficient { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ + __global__ void upSweep(const int n, const int step, int *data) { + + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + int rIndex = n - 1 - index; + if (index - step >= 0 && (rIndex % (step * 2) == 0)) { + data[index] = data[index] + data[index - step]; + } + __syncthreads(); + } + + __global__ void downSweep(const int n, const int step, int *data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + int rIndex = n - 1 - index; + ///Only certain index is working. + if (index - step >= 0 && (rIndex % (step * 2) == 0) ) { + auto tmp = data[index]; + data[index] += data[index - step]; + data[index - step] = tmp; + } + __syncthreads(); + } + + void scanOnGPU(const int n, int *dev_data) { + dim3 blockCount = (n - 1) / BLOCK_SIZE + 1; + int step; + for (step = 1; step < n; step <<= 1) { + upSweep << > >(n, step, dev_data); + } + cudaMemset(&dev_data[n - 1], 0, sizeof(int)); + for (step >>= 1; step > 0; step >>= 1) { + downSweep << > >(n, step, dev_data); + } + } + + void scan(int n, int *odata, const int *idata) { + // TODO + int *dev_data; + cudaMalloc((void**)&dev_data, sizeof(int) * n); + cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n, cudaMemcpyHostToDevice); + scanOnGPU(n, dev_data); + cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data); + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. 
+ */ + int compact(int n, int *odata, const int *idata) { + // TODO + int count = 0; + int *dev_data; + int *dev_dataCopy; + int *dev_bool; + int *dev_indices; + for (int i = 0; i < n; ++i) + count = count + (idata[i] != 0); + + // device memory allocation + timer().startGpuTimer(); + + cudaMalloc((void**)&dev_data, sizeof(int) * n); + cudaMalloc((void**)&dev_dataCopy, sizeof(int) * n); + cudaMalloc((void**)&dev_bool, sizeof(int) * n); + cudaMalloc((void**)&dev_indices, sizeof(int) * n); + // copy input data to device + cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n, cudaMemcpyHostToDevice); + dim3 blockCount = (n - 1) / BLOCK_SIZE + 1; + Common::kernMapToBoolean << > >(n, dev_bool, dev_data); + cudaMemcpy((void*)dev_indices, (const void*)dev_bool, sizeof(int) * n, cudaMemcpyDeviceToDevice); + scanOnGPU(n, dev_indices); + cudaMemcpy((void*)dev_dataCopy, (const void*)dev_data, sizeof(int) * n, cudaMemcpyDeviceToDevice); + Common::kernScatter << > >(n, dev_data, dev_dataCopy, dev_bool, dev_indices); + // copy result to host + cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + + timer().endGpuTimer(); + // free memory on device + cudaFree(dev_data); + cudaFree(dev_dataCopy); + cudaFree(dev_bool); + cudaFree(dev_indices); + + return count; + } + } +} \ No newline at end of file diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..05186b4 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,23 +3,96 @@ #include "common.h" #include "naive.h" +#define BLOCK_SIZE 896 namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } // TODO: __global__ + // TODO: __global__ + __global__ void work(const int n, int 
*idata, int *odata) { + extern __shared__ int temp[]; + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= n) { + return; + } + /* + if (index >= n) { + return; + } + + if (index - step < 0) { + odata[index] = idata[index]; + } else { + odata[index] = idata[index] + idata[index - step]; + } + */ + + int input = 1, output = 0; + temp[index] = idata[index]; + __syncthreads(); + //33 43 45 + //0 33 43 45 + //0 33 76 88 + //0 33 76 121 + for (int step = 1; step < n; step <<= 1) { + input ^= 1; + output ^= 1; + if (index - step < 0) { + temp[output * n + index] = temp[input * n + index]; + } + else { + temp[output * n + index] = temp[input * n + index] + temp[input * n + index - step]; + } + __syncthreads(); + } + odata[index] = temp[(output*n) + index]; + } + + __global__ void moveToExclusive(const int n, int *idata, int *odata) { + + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= n) { + return; + } + else if (index == 0) { + odata[index] = 0; + return; + } + + odata[index] = idata[index - 1]; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); +// timer().startGpuTimer(); + int *dev_input; + int *dev_output; + int input = 1; + int output = 0; + // device memory allocation + cudaMalloc((void**)&dev_input, sizeof(int) * n); + cudaMalloc((void**)&dev_output, sizeof(int) * n); + cudaMemcpy(dev_output, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + dim3 blockCount = (n - 1) / BLOCK_SIZE + 1; + moveToExclusive << > >(n, dev_output, dev_input); + //We want exclusive result. Not inclusive. 
+ timer().startGpuTimer(); + work << > >(n, dev_input, dev_output); + timer().endGpuTimer(); + cudaDeviceSynchronize(); + cudaMemcpy(odata, dev_output, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_input); + cudaFree(dev_output); +// timer().endGpuTimer(); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..523515b 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -8,21 +8,27 @@ namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { +// timer().startGpuTimer(); + int *dev_data; + cudaMalloc((void**)&dev_data, sizeof(int) * n); + cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n, cudaMemcpyHostToDevice); + thrust::device_ptr dev_thrust_data(dev_data); timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); + thrust::exclusive_scan(dev_thrust_data, dev_thrust_data + n, dev_thrust_data); + timer().endGpuTimer(); + cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data); +// timer().endGpuTimer(); } } }