decred · davecgh · Sep 6, 2023 · Sep 4, 2023 · Sep 4, 2023 · Sep 4, 2023
diff --git a/GNUmakefile b/GNUmakefile
@@ -17,7 +17,7 @@ endif
 #
 # Windows build assumes that CUDA V7.0 is installed in its default location.
 #
-# Windows gominer requires nvml.dll and decred.dll to reside in the same
+# Windows gominer requires nvml.dll and blake3-decred.dll to reside in the same
 # directory as gominer.exe.
 ifeq ($(ARCH),Msys)
 obj: nvidia
@@ -30,24 +30,24 @@ endif
 	mkdir obj
 
 ifeq ($(ARCH),Msys)
-obj/decred.dll: obj sph/blake.c decred.cu
-	$(NVCC) --shared --optimize=3 --compiler-options=-GS-,-MD -I. -Isph decred.cu sph/blake.c -o obj/decred.dll
+obj/blake3-decred.dll: obj blake3.cu
+	$(NVCC) --shared --optimize=3 --compiler-options=-GS-,-MD -I. blake3.cu -o obj/blake3-decred.dll
 else
-obj/decred.a: obj sph/blake.c decred.cu
-	$(NVCC) --lib --optimize=3 -I. decred.cu sph/blake.c -o obj/decred.a
+obj/blake3.a: obj blake3.cu
+	$(NVCC) --lib --optimize=3 -I. blake3.cu -o obj/blake3.a
 endif
 
 ifeq ($(ARCH),Msys)
-build: obj/decred.dll
+build: obj/blake3-decred.dll
 else
-build: obj/decred.a
+build: obj/blake3.a
 endif
 	go build -tags 'cuda'
 
 ifeq ($(ARCH),Msys)
-install: obj/decred.dll
+install: obj/blake3-decred.dll
 else
-install: obj/decred.a
+install: obj/blake3.a
 endif
 	go install -tags 'cuda'
 

diff --git a/README.md b/README.md
@@ -78,9 +78,6 @@ $ curl http://localhost:3333/
 
 #### Pre-Requisites
 
-NOTE: The CUDA support has NOT been updated yet for BLAKE3.  Matheus is working
-on adding support, so this section hasn't been modified, but it is out of date.
-
 You will either need to install CUDA for NVIDIA graphics cards or OpenCL
 library/headers that support your device such as: AMDGPU-PRO (for newer AMD
 cards), Beignet (for Intel Graphics), or Catalyst (for older AMD cards).
@@ -89,12 +86,11 @@ For example, on Ubuntu 23.04 you can install the necessary OpenCL packages (for
 Intel Graphics) and CUDA libraries with:
 
 ```
-sudo apt-get install beignet-dev nvidia-cuda-dev nvidia-cuda-toolkit
+sudo apt-get install nvidia-cuda-dev nvidia-cuda-toolkit
 ```
 
 gominer has been built successfully on Ubuntu 23.04 with go1.21.0,
-g++ 5.4.0, and beignet-dev 1.1.1-2 although other combinations should work as
-well.
+g++ 5.4.0 although other combinations should work as well.
 
 #### Instructions
 
@@ -135,6 +131,9 @@ go build -tags opencladl
 
 ##### CUDA
 
+**NOTE**: The CUDA version of the Blake3 gominer is not yet compatible to
+windows.
+
 ###### Pre-Requisites
 
 - Download Microsoft Visual Studio 2013 from [https://www.microsoft.com/en-us/download/details.aspx?id=44914](https://www.microsoft.com/en-us/download/details.aspx?id=44914)

diff --git a/blake3.cu b/blake3.cu
@@ -0,0 +1,182 @@
+// Copyright (c) 2023 The Decred developers.
+//
+// Decred BLAKE3 midstate-based CUDA kernel
+
+// Written and optimized by Dave Collins Sep 2023.
+
+#include <stdint.h>
+
+#if defined(_WIN32)
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif /* _WIN32 */
+
+#define MAX_OUTPUT_RESULTS 32
+
+// Written and optimized by Dave Collins Sep 2023.
+#define ROTR(v, n) __funnelshift_rc((v), (v), n)
+
+__global__
+void search(
+    uint *output,
+    // Midstate.
+    const uint cv0,
+    const uint cv1,
+    const uint cv2,
+    const uint cv3,
+    const uint cv4,
+    const uint cv5,
+    const uint cv6,
+    const uint cv7,
+
+    // Final 52 bytes of data.
+    const uint m0,
+    const uint m1,
+    const uint m2,
+    // const uint m3 : nonce
+    const uint m4,
+    const uint m5,
+    const uint m6,
+    const uint m7,
+    const uint m8,
+    const uint m9,
+    const uint m10,
+    const uint m11,
+    const uint m12)
+{
+    // Nonce.
+    const uint m3 = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // BLAKE3 init vectors.
+    const uint iv0 = 0x6a09e667ul;
+    const uint iv1 = 0xbb67ae85ul;
+    const uint iv2 = 0x3c6ef372ul;
+    const uint iv3 = 0xa54ff53aul;
+    // const uint iv4 = 0x510e527ful;
+    // const uint iv5 = 0x9b05688cul;
+    // const uint iv6 = 0x1f83d9abul;
+    // const uint iv7 = 0x5be0cd19ul;
+
+    // Internal compression func state.
+    uint v0, v1, v2, v3, v4, v5, v6, v7;
+    uint v8, v9, v10, v11, v12, v13, v14, v15;
+
+    // Do the initialization and first round together.
+    // Round 1.
+    v0 = cv0 + cv4 + m0; v12 = ROTR(v0, 16); v8 = iv0 + v12; v4 = ROTR(cv4 ^ v8, 12); v0 += v4 + m1; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = cv1 + cv5 + m2; v13 = ROTR(v1, 16); v9 = iv1 + v13; v5 = ROTR(cv5 ^ v9, 12); v1 += v5 + m3; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = cv2 + cv6 + m4; v14 = ROTR(52 ^ v2, 16); v10 = iv2 + v14; v6 = ROTR(cv6 ^ v10, 12); v2 += v6 + m5; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = cv3 + cv7 + m6; v15 = ROTR(10 ^ v3, 16); v11 = iv3 + v15; v7 = ROTR(cv7 ^ v11, 12); v3 += v7 + m7; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m8; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m9; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m10; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m11; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m12; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 2 with message word permutation.
+    v0 = v0 + v4 + m2; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12); v0 += v4 + m6; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m3; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12); v1 += v5 + m10; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6 + m7; v14 = ROTR(v14 ^ v2, 16); v10 += v14; v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m0; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7 + m4; v15 = ROTR(v15 ^ v3, 16); v11 += v15; v7 = ROTR(v7 ^ v11, 12); v3 += v7; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m1; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m11; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m12; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m5; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m9; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4 + m8; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 3 with message word permutation.
+    v0 = v0 + v4 + m3; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12); v0 += v4 + m4; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m10; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12); v1 += v5 + m12; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6; v14 = ROTR(v14 ^ v2, 16); v10 += v14; v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m2; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7 + m7; v15 = ROTR(v15 ^ v3, 16); v11 += v15; v7 = ROTR(v7 ^ v11, 12); v3 += v7; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m6; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m5; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m9; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m0; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m11; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m8; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4 + m1; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 4 with message word permutation.
+    v0 = v0 + v4 + m10; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12); v0 += v4 + m7; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m12; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12); v1 += v5 + m9; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6; v14 = ROTR(v14 ^ v2, 16); v10 += v14; v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m3; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7; v15 = ROTR(v15 ^ v3, 16); v11 += v15; v7 = ROTR(v7 ^ v11, 12); v3 += v7; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m4; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m0; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m11; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m2; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m5; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7 + m8; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m1; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4 + m6; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 5 with message word permutation.
+    v0 = v0 + v4 + m12; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12); v0 += v4; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m9; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12); v1 += v5 + m11; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6; v14 = ROTR(v14 ^ v2, 16); v10 += v14; v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m10; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7; v15 = ROTR(v15 ^ v3, 16); v11 += v15; v7 = ROTR(v7 ^ v11, 12); v3 += v7 + m8; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m7; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m2; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m5; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m3; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m0; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7 + m1; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m6; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4 + m4; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 6 with message word permutation.
+    v0 = v0 + v4 + m9; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12); v0 += v4; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m11; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12); v1 += v5 + m5; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6 + m8; v14 = ROTR(v14 ^ v2, 16); v10 += v14; v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m12; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7; v15 = ROTR(v15 ^ v3, 16); v11 += v15; v7 = ROTR(v7 ^ v11, 12); v3 += v7 + m1; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m3; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m0; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m10; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m2; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7 + m6; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m4; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4 + m7; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 7 with message word permutation.
+    v0 = v0 + v4 + m11; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12); v0 += v4; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m5; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12); v1 += v5 + m0; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6 + m1; v14 = ROTR(v14 ^ v2, 16); v10 += v14; v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m9; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7 + m8; v15 = ROTR(v15 ^ v3, 16); v11 += v15; v7 = ROTR(v7 ^ v11, 12); v3 += v7 + m6; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m10; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m2; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m12; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m3; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7 + m4; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m7; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Finally the truncated 256-bit output is defined as:
+    //
+    // h'0 = v0^v8
+    // h'1 = v1^v9
+    // h'2 = v2^v10
+    // h'3 = v3^v11
+    // h'4 = v4^v12
+    // h'5 = v5^v13
+    // h'6 = v6^v14
+    // h'7 = v7^v15
+    //
+    // Just check if the last word (32-bits) is zeroed and return back to the
+    // miner to notify it that a potential solution was found so it can check it
+    // against the target difficulty.
+
+    // Debug code to print result of hashing function.
+    // if (!((v7 ^ v15) & 0xFFFF0000)) {
+    //     printf("hash on gpu %x %x %x %x %x %x %x %x\n",
+    //         v0 ^ v8, v1 ^ v9, v2 ^ v10, v3 ^ v11,
+    //         v4 ^ v12, v5 ^ v13, v6 ^ v14, v7 ^ v15);
+    //     printf("nonce for hash on gpu %x\n", m3);
+    // }
+
+    if (v7 ^ v15)
+        return;
+
+    // Update nonce.
+    uint pos = atomicInc(&output[0], 0xffffffff)+1;
+    if (pos > MAX_OUTPUT_RESULTS) return; // Bounds check output buffer.
+    output[pos] = m3;
+}
+
+
+extern "C" {
+__host__ DLLEXPORT void
+decred_blake3_hash(const uint32_t dimgrid, const uint32_t threads, uint32_t *cv, uint32_t *m, uint32_t *out)
+{
+	search<<<dimgrid, threads>>>(
+		out,
+		cv[0], cv[1], cv[2], cv[3], cv[4], cv[5], cv[6], cv[7],
+		m[0], m[1], m[2],
+		// m3,
+		m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12]
+
+	);
+}
+}
diff --git a/cgo_flags.go b/cgo_flags.go
@@ -6,7 +6,7 @@
 package main
 
 /*
-#cgo !windows LDFLAGS: -L/opt/cuda/lib64 -L/opt/cuda/lib -lcuda -lcudart -lstdc++ obj/decred.a
-#cgo windows LDFLAGS: -Lobj -ldecred -Lnvidia/CUDA/v7.0/lib/x64 -lcuda -lcudart -Lnvidia/NVSMI -lnvml
+#cgo !windows LDFLAGS: -L/opt/cuda/lib64 -L/opt/cuda/lib -lcuda -lcudart -lstdc++ obj/blake3.a
+#cgo windows LDFLAGS: -Lobj -lblake3-decred -Lnvidia/CUDA/v7.0/lib/x64 -lcuda -lcudart -Lnvidia/NVSMI -lnvml
 */
 import "C"
diff --git a/compat.h b/compat.h