diff --git a/README.md b/README.md
index 41b91f0..114a5fb 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,47 @@
 CUDA Rasterizer
 ===============
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4 - Rasterizer**
 
-[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md)
+* Liam Dugan
+  * [LinkedIn](https://www.linkedin.com/in/liam-dugan-95a961135/), [personal website](http://liamdugan.com/)
+* Tested on: Windows 10, Ryzen 5 1600 @ 3.20GHz 16GB, GTX 1070 16GB (Personal Computer)
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
+![](images/duck.gif)
 
-* (TODO) YOUR NAME HERE
-	* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+What is Rasterization?
+=============
 
-### (TODO: Your README)
+<img align="right" src="images/rasterization.png">
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+Rasterization, unlike path or ray tracing has no concept of light rays and is therefore a much faster method of rendering 3D models to the screen. 
+
+A rasterizer pipeline has 5 basic stages:
+1. **Vertex Shader** -- Transforms the vertices to the correct coordinates
+2. **Primitive Assembly** -- Assembles the vertices into triangles
+3. **Rasterization** -- Determine what pixels on the screen correspond to each triangle
+4. **Fragment Shader** -- For each pixel, determine the color and lighting effect to use
+5. **Output** -- Write the color values for each pixel to the output
+
+Each of these stages is **Embarassingly Parallel** so rasterization is sped up greatly with the help of the GPU.
+
+For example, instead of looping over each pixel for every triangle in a scene sequentially, we can instead create one thread per triangle and have all threads loop over only their own triangle's bounding box. This allows rasterization to give smooth real time rendering of 3D models.
+
+Extra Features
+=============
+
+UV texture mapping & Super-Sample Antialiasing
+-------
+
+![](images/truck.png)
+
+By interpolating between the texture coordinates of each vector we are able to get a mapping from a texture onto our model. Additionally, if we render the model at x4 resolution and then scale it down, we can get an image that is of much higher quality and has much fewer artifacts.
+
+Color interpolation
+-------
+![](images/cowGray.png)
+![](images/cowMulti.png)
+
+By assigning the color of a given pixel to be equal to it's barycentric coordinate with respect to a triangle, we get a very nice multicolor effect that shows off how many triangles there are in these models.
 
 
 ### Credits
diff --git a/images/cowGray.png b/images/cowGray.png
new file mode 100644
index 0000000..a9e38b1
Binary files /dev/null and b/images/cowGray.png differ
diff --git a/images/cowMulti.png b/images/cowMulti.png
new file mode 100644
index 0000000..8a1b2cd
Binary files /dev/null and b/images/cowMulti.png differ
diff --git a/images/duck.gif b/images/duck.gif
new file mode 100644
index 0000000..3a18842
Binary files /dev/null and b/images/duck.gif differ
diff --git a/images/rasterization.png b/images/rasterization.png
new file mode 100644
index 0000000..1d5cef6
Binary files /dev/null and b/images/rasterization.png differ
diff --git a/images/truck.png b/images/truck.png
new file mode 100644
index 0000000..642df6d
Binary files /dev/null and b/images/truck.png differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a57f69f..00edee0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,5 +6,5 @@ set(SOURCE_FILES
 
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 1262a09..d1b311e 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -18,6 +18,11 @@
 #include <glm/gtc/quaternion.hpp>
 #include <glm/gtc/matrix_transform.hpp>
 
+#define Z_VALUE_PRECISION 10000.0f
+#define PERSPECTIVE_COLOR 0
+#define INTENSITY 1.0f
+#define SCALE_FACTOR 2
+
 namespace {
 
 	typedef unsigned short VertexIndex;
@@ -43,11 +48,10 @@ namespace {
 
 		 glm::vec3 eyePos;	// eye space position used for shading
 		 glm::vec3 eyeNor;	// eye space normal used for shading, cuz normal will go wrong after perspective transformation
-		// glm::vec3 col;
+		 glm::vec3 col;
 		 glm::vec2 texcoord0;
 		 TextureData* dev_diffuseTex = NULL;
-		// int texWidth, texHeight;
-		// ...
+		 int texWidth, texHeight;
 	};
 
 	struct Primitive {
@@ -62,10 +66,10 @@ namespace {
 		// The attributes listed below might be useful, 
 		// but always feel free to modify on your own
 
-		// glm::vec3 eyePos;	// eye space position used for shading
-		// glm::vec3 eyeNor;
-		// VertexAttributeTexcoord texcoord0;
-		// TextureData* dev_diffuseTex;
+	  glm::vec3 eyePos;	// eye space position used for shading
+	  glm::vec3 eyeNor;
+		VertexAttributeTexcoord texcoord0;
+		TextureData* dev_diffuseTex;
 		// ...
 	};
 
@@ -100,7 +104,6 @@ namespace {
 
 static std::map<std::string, std::vector<PrimitiveDevBufPointers>> mesh2PrimitivesMap;
 
-
 static int width = 0;
 static int height = 0;
 
@@ -111,62 +114,23 @@ static glm::vec3 *dev_framebuffer = NULL;
 
 static int * dev_depth = NULL;	// you might need this buffer when doing depth test
 
-/**
- * Kernel that writes the image to the OpenGL PBO directly.
- */
-__global__ 
-void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * w);
-
-    if (x < w && y < h) {
-        glm::vec3 color;
-        color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0;
-        color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0;
-        color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0;
-        // Each thread writes one pixel location in the texture (textel)
-        pbo[index].w = 0;
-        pbo[index].x = color.x;
-        pbo[index].y = color.y;
-        pbo[index].z = color.z;
-    }
-}
-
-/** 
-* Writes fragment colors to the framebuffer
-*/
-__global__
-void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * w);
-
-    if (x < w && y < h) {
-        framebuffer[index] = fragmentBuffer[index].color;
-
-		// TODO: add your fragment shader code here
-
-    }
-}
-
 /**
  * Called once at the beginning of the program to allocate memory.
  */
 void rasterizeInit(int w, int h) {
-    width = w;
-    height = h;
-	cudaFree(dev_fragmentBuffer);
-	cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment));
-	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
+    width = w * SCALE_FACTOR;
+    height = h * SCALE_FACTOR;
+	  cudaFree(dev_fragmentBuffer);
+  	cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment));
+  	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
     cudaFree(dev_framebuffer);
     cudaMalloc(&dev_framebuffer,   width * height * sizeof(glm::vec3));
     cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
-    
-	cudaFree(dev_depth);
-	cudaMalloc(&dev_depth, width * height * sizeof(int));
+     
+  	cudaFree(dev_depth);
+  	cudaMalloc(&dev_depth, width * height * sizeof(int));
 
-	checkCUDAError("rasterizeInit");
+  	checkCUDAError("rasterizeInit");
 }
 
 __global__
@@ -621,8 +585,6 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 
 }
 
-
-
 __global__ 
 void _vertexTransformAndAssembly(
 	int numVertices, 
@@ -634,19 +596,39 @@ void _vertexTransformAndAssembly(
 	int vid = (blockIdx.x * blockDim.x) + threadIdx.x;
 	if (vid < numVertices) {
 
-		// TODO: Apply vertex transformation here
-		// Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
-		// Then divide the pos by its w element to transform into NDC space
-		// Finally transform x and y to viewport space
-
-		// TODO: Apply vertex assembly here
-		// Assemble all attribute arraies into the primitive array
-		
+    // get the position vectors
+    glm::vec4 position = glm::vec4(primitive.dev_position[vid], 1.0f);
+    glm::vec3 normal = primitive.dev_normal[vid];
+    glm::vec3 eyeNorm = glm::normalize(MV_normal * normal);
+    glm::vec4 eyePos = MV * position;
+
+    // ensure position is correctly in viewport coords
+    position = MVP * position;
+    position = position / position.w; // is this -1 to 1 ?
+    position.x = 0.5f * (float)width * (position.x + 1.0f);
+    position.y = 0.5f * (float)height * (1.0f - position.y);
+    position.z = 0;
+
+    // set the output buffer
+    primitive.dev_verticesOut[vid].pos = position;
+    primitive.dev_verticesOut[vid].texHeight = primitive.diffuseTexHeight;
+    primitive.dev_verticesOut[vid].texWidth = primitive.diffuseTexWidth;
+    primitive.dev_verticesOut[vid].eyePos = glm::vec3(eyePos);
+    primitive.dev_verticesOut[vid].eyeNor = eyeNorm;
+
+    if (primitive.dev_texcoord0 != NULL && primitive.dev_diffuseTex != NULL)
+    {
+      primitive.dev_verticesOut[vid].texcoord0 = primitive.dev_texcoord0[vid];
+      primitive.dev_verticesOut[vid].dev_diffuseTex = primitive.dev_diffuseTex;
+    } 
+    else
+    {
+      primitive.dev_verticesOut[vid].texcoord0 = glm::vec2(0, 0);
+      primitive.dev_verticesOut[vid].dev_diffuseTex = NULL;
+    }
 	}
 }
 
-
-
 static int curPrimitiveBeginId = 0;
 
 __global__ 
@@ -656,36 +638,152 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_
 	int iid = (blockIdx.x * blockDim.x) + threadIdx.x;
 
 	if (iid < numIndices) {
-
-		// TODO: uncomment the following code for a start
-		// This is primitive assembly for triangles
-
-		//int pid;	// id for cur primitives vector
-		//if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
-		//	pid = iid / (int)primitive.primitiveType;
-		//	dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
-		//		= primitive.dev_verticesOut[primitive.dev_indices[iid]];
-		//}
-
+		int pid;	// id for cur primitives vector
+		if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
+			pid = iid / (int)primitive.primitiveType;
+			dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
+				= primitive.dev_verticesOut[primitive.dev_indices[iid]];
+		}
 
 		// TODO: other primitive types (point, line)
 	}
 	
 }
 
+/**
+* Writes fragment colors to the framebuffer
+*/
+__device__
+void render(int index, const Fragment& me, glm::vec3 *framebuffer) 
+{
+  float intensity = glm::dot(me.eyeNor, glm::vec3(0.0f, 0.0f, 1.0f)) * INTENSITY;
+  glm::vec3 lambertianColor = glm::vec3(intensity*me.color.r, intensity*me.color.g, intensity*me.color.b);
+  framebuffer[index] = lambertianColor;
+}
+
+__global__ void _rasterize(int w, int h, int numPrimitives, Primitive* dev_primitives, Fragment* dev_fragmentBuffer, glm::vec3 *dev_framebuffer, int* dev_depth)
+{
+  // primitive index
+  int iid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  if (iid < numPrimitives) 
+  {
+    const Primitive& me = dev_primitives[iid];
+    const glm::vec3 tri[3] = { glm::vec3(me.v[0].pos), glm::vec3(me.v[1].pos), glm::vec3(me.v[2].pos) };
+    const AABB boundingBox = getAABBForTriangle(tri);
+
+    for (int x = boundingBox.min.x; x < boundingBox.max.x; ++x)
+    {
+      for (int y = boundingBox.min.y; y < boundingBox.max.y; ++y)
+      {
+        // calculate the barycentric coords
+        const glm::vec2 pixel(x, y);
+        const glm::vec3 baryCoord = calculateBarycentricCoordinate(tri, pixel);
+ 
+        // if our pixel is within a triangle
+        if (isBarycentricCoordInBounds(baryCoord)) 
+        {
+  
+          // perform the depth test
+          const glm::vec3 eyePosTri[3] = { glm::vec3(me.v[0].eyePos), glm::vec3(me.v[1].eyePos), glm::vec3(me.v[2].eyePos) };
+          float zValue = getZAtCoordinate(baryCoord, eyePosTri);
+          int z = zValue * Z_VALUE_PRECISION;
+          atomicMin(&dev_depth[x + y*w], z);
+          int index = x + y*w;
+
+          // if we pass the depth test, fill in the fragment and call the fragment shader
+          if (z == dev_depth[index])
+          {
+            Fragment& thisFrag = dev_fragmentBuffer[x + y*w];
+
+            // barycentric interpolate to get eyePos
+            thisFrag.eyePos = baryCoord.x * me.v[0].eyePos + baryCoord.y * me.v[1].eyePos + baryCoord.z * me.v[2].eyePos;
+            thisFrag.eyeNor = baryCoord.x * me.v[0].eyeNor + baryCoord.y * me.v[1].eyeNor + baryCoord.z * me.v[2].eyeNor;
+
+            // if we are dealing with a texture then interpolate texcoord
+            if (me.v[0].dev_diffuseTex != NULL)
+            {
+              thisFrag.texcoord0 = baryCoord.x * me.v[0].texcoord0 + baryCoord.y * me.v[1].texcoord0 + baryCoord.z * me.v[2].texcoord0;
+              thisFrag.texcoord0.x *= me.v[0].texWidth;
+              thisFrag.texcoord0.y *= me.v[0].texHeight;
+              thisFrag.dev_diffuseTex = me.v[0].dev_diffuseTex;
+
+              // multiply by 3 for RBG stride
+              int texIndex = (((int)thisFrag.texcoord0.x) + ((int)thisFrag.texcoord0.y) * me.v[0].texWidth) * 3;
+
+              // get the texture data
+              TextureData redVal = thisFrag.dev_diffuseTex[texIndex];
+              TextureData greenVal = thisFrag.dev_diffuseTex[texIndex + 1];
+              TextureData blueVal = thisFrag.dev_diffuseTex[texIndex + 2];
+
+              // assign the appropriate color
+              thisFrag.color = glm::vec3(((float) redVal) / 255.0f, ((float) greenVal) / 255.0f, ((float) blueVal) / 255.0f);
+            }
+            else if (PERSPECTIVE_COLOR)
+            {
+              // interpolate between colors
+              thisFrag.color = baryCoord;
+            }
+            else
+            {
+              thisFrag.color = glm::vec3(0.3f, 0.3f, 0.3f);
+            }
+
+            // call the fragment shader
+            render(index, thisFrag, dev_framebuffer);
+          }
+        }
+      }
+    }
+  }
+}
 
+/**
+* Kernel that writes the image to the OpenGL PBO directly.
+*/
+__global__
+void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
+  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+  if (x < (w / SCALE_FACTOR) && y < (h / SCALE_FACTOR)) {
+    glm::vec3 color;
+    // average the red green and blue
+    float scaleSquared = SCALE_FACTOR * SCALE_FACTOR;
+
+    for (int i = 0; i < SCALE_FACTOR; ++i)
+    {
+      for (int j = 0; j < SCALE_FACTOR; ++j)
+      {
+        color.r += glm::clamp(image[(x*SCALE_FACTOR + i) + (y*SCALE_FACTOR + j) * w].x, 0.0f, 1.0f) * 255.0;
+        color.g += glm::clamp(image[(x*SCALE_FACTOR + i) + (y*SCALE_FACTOR + j) * w].y, 0.0f, 1.0f) * 255.0;
+        color.b += glm::clamp(image[(x*SCALE_FACTOR + i) + (y*SCALE_FACTOR + j) * w].z, 0.0f, 1.0f) * 255.0;
+      }
+    }
+    
+    int index = x + (y * (w / SCALE_FACTOR));
+
+    // Each thread writes one pixel location in the texture (textel)
+    pbo[index].w = 0;
+    pbo[index].x = color.x / scaleSquared;
+    pbo[index].y = color.y / scaleSquared;
+    pbo[index].z = color.z / scaleSquared;
+  }
+}
 
 /**
  * Perform rasterization.
  */
-void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) {
-    int sideLength2d = 8;
-    dim3 blockSize2d(sideLength2d, sideLength2d);
-    dim3 blockCount2d((width  - 1) / blockSize2d.x + 1,
-		(height - 1) / blockSize2d.y + 1);
+void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) 
+{
+  int sideLength2d = 8;
+  dim3 blockSize2d(sideLength2d, sideLength2d);
+  dim3 blockCount2d((width  - 1) / blockSize2d.x + 1, (height - 1) / blockSize2d.y + 1);
+  dim3 blockCount1d((width*height - 1) / sideLength2d + 1);
 
-	// Execute your rasterization pipeline here
-	// (See README for rasterization pipeline outline.)
+  cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
+  cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
+  initDepth << <blockCount2d, blockSize2d >> >(width, height, dev_depth);
 
 	// Vertex Process & primitive assembly
 	{
@@ -701,15 +799,13 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 			for (; p != pEnd; ++p) {
 				dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
 				dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
-
+        
 				_vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height);
 				checkCUDAError("Vertex Processing");
+
 				cudaDeviceSynchronize();
-				_primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
-					(p->numIndices, 
-					curPrimitiveBeginId, 
-					dev_primitives, 
-					*p);
+				
+        _primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >(p->numIndices, curPrimitiveBeginId, dev_primitives, *p);
 				checkCUDAError("Primitive Assembly");
 
 				curPrimitiveBeginId += p->numPrimitives;
@@ -718,20 +814,22 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 
 		checkCUDAError("Vertex Processing and Primitive Assembly");
 	}
+
+
+
+  // number of blocks / size of blocks should be proportional to numPrimitives
+  int blockSize = 32; // size of a warp
+  int numBlocks = (totalNumPrimitives / blockSize) + 1;
 	
-	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
-	initDepth << <blockCount2d, blockSize2d >> >(width, height, dev_depth);
-	
-	// TODO: rasterize
+  _rasterize << <numBlocks, blockSize >> > (width, height, totalNumPrimitives, dev_primitives, dev_fragmentBuffer, dev_framebuffer, dev_depth);
+  checkCUDAError("rasterizer");
 
+  cudaDeviceSynchronize();
 
+  // Copy framebuffer into OpenGL buffer for OpenGL previewing
+  sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
+  checkCUDAError("copy render result to pbo");
 
-    // Copy depthbuffer colors into framebuffer
-	render << <blockCount2d, blockSize2d >> >(width, height, dev_fragmentBuffer, dev_framebuffer);
-	checkCUDAError("fragment shader");
-    // Copy framebuffer into OpenGL buffer for OpenGL previewing
-    sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
-    checkCUDAError("copy render result to pbo");
 }
 
 /**