Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update patchset to reduce differences to upstream #6

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Main reasons why this plugin is separated project is:

## Windows usage

* [Download](https://github.com/xmrig/xmrig-cuda/releases) the plugin; you must choose a CUDA version, usually the most recent one (CUDA 10.1), but builds with older CUDA versions are also provided. Alternatively, you can build the plugin from source (requires GCC 9).
* [Download](https://github.com/MoneroOcean/xmrig-cuda/releases) the plugin; you must choose a CUDA version, usually the most recent one (CUDA 10.1), but builds with older CUDA versions are also provided. Alternatively, you can build the plugin from source (requires GCC 9).
* Place **`xmrig-cuda.dll`** and other dll files near to **`xmrig.exe`**.
* Edit **`config.json`** to enable CUDA support.
```
Expand Down
10 changes: 7 additions & 3 deletions src/RandomX/hash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,13 @@ void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, ui
// CUDA_CHECK_KERNEL(ctx->device_id, sipesh<<<batch_size / 32, 32>>>(ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
// k12(input, inputSize, tempHash);
// CUDA_CHECK_KERNEL(ctx->device_id, k12<<<batch_size / 32, 32>>>(ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
} else {
CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash<<<batch_size / 32, 32>>>(ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
}
//=======
// rx_blake2b_wrapper::run(out, outlen, in, inlen);
// yespower_params_t params = { YESPOWER_1_0, 2048, 8, NULL };
// if (yespower_tls((const uint8_t *)out, outlen, &params, (yespower_binary_t *)out)) return -1;
// return KangarooTwelve((const unsigned char *)out, outlen, (unsigned char *)out, 32, 0, 0);
} else
CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash<<<batch_size / 32, 32>>>(ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
CUDA_CHECK_KERNEL(ctx->device_id, fillAes1Rx4<RANDOMX_SCRATCHPAD_L3, false, 64><<<batch_size / 32, 32 * 4>>>(ctx->d_rx_hashes, ctx->d_long_state, batch_size));
CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_rx_rounding, 0, batch_size * sizeof(uint32_t)));

Expand Down
2 changes: 2 additions & 0 deletions src/RandomX/randomx_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,12 +316,14 @@ __device__ void print_inst(uint2 inst)
}
opcode -= RANDOMX_FREQ_IROR_R;

#if RANDOMX_FREQ_IROL_R > 0
if (opcode < RANDOMX_FREQ_IROL_R)
{
printf("%s%sIROL_R r%u, r%u ", branch_target, fp_inst, dst, src);
break;
}
opcode -= RANDOMX_FREQ_IROL_R;
#endif

if (opcode < RANDOMX_FREQ_ISWAP_R)
{
Expand Down
4 changes: 4 additions & 0 deletions src/crypto/cn/CnAlgo.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,12 +142,16 @@ template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_PICO_0>::iterations()
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_PICO_TLO>::iterations() const { return CN_ITER / 8; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_CCX>::iterations() const { return CN_ITER / 2; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_UPX2>::iterations() const { return CN_ITER / 32; }
#ifdef XMRIG_ALGO_CN_GPU
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_GPU>::iterations() const { return 0xC000; }
#endif


template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_PICO_0>::mask() const { return 0x1FFF0; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_UPX2>::mask() const { return 0x1FFF0; }
#ifdef XMRIG_ALGO_CN_GPU
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_GPU>::mask() const { return 0x1FFFC0; }
#endif


} /* namespace xmrig_cuda */
Expand Down
2 changes: 1 addition & 1 deletion src/crypto/common/Algorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ xmrig_cuda::Algorithm::Id xmrig_cuda::Algorithm::parse(uint32_t id)
CN_GPU,
# endif
# ifdef XMRIG_ALGO_RANDOMX
RX_XLA,
RX_0, RX_WOW, RX_ARQ, RX_GRAFT, RX_SFX, RX_KEVA,
RX_XLA,
# endif
# ifdef XMRIG_ALGO_ARGON2
AR2_CHUKWA, AR2_CHUKWA_V2, AR2_WRKZ,
Expand Down
3 changes: 1 addition & 2 deletions src/crypto/common/Algorithm.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,12 @@ class Algorithm
RX_GRAFT = 0x72151267, // "rx/graft" RandomGRAFT (Graft).
RX_SFX = 0x72151273, // "rx/sfx" RandomSFX (Safex Cash).
RX_KEVA = 0x7214116b, // "rx/keva" RandomKEVA (Keva).
RX_XLA = 0x721211ff, // "panthera" Panthera (Scala2).
AR2_CHUKWA = 0x61130000, // "argon2/chukwa" Argon2id (Chukwa).
AR2_CHUKWA_V2 = 0x61140000, // "argon2/chukwav2" Argon2id (Chukwa v2).
AR2_WRKZ = 0x61120000, // "argon2/wrkz" Argon2id (WRKZ)
ASTROBWT_DERO = 0x41000000, // "astrobwt" AstroBWT (Dero)
KAWPOW_RVN = 0x6b0f0000, // "kawpow/rvn" KawPow (RVN)

RX_XLA = 0x721211ff, // "panthera" Panthera (Scala2).
};

enum Family : uint32_t {
Expand Down
2 changes: 1 addition & 1 deletion src/cryptonight.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ namespace RandomX_Arqma { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t ta
namespace RandomX_Monero { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Wownero { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Keva { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_DefyX { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Graft { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_DefyX { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }

void astrobwt_prepare(nvid_ctx *ctx, uint32_t batch_size);

Expand Down
18 changes: 13 additions & 5 deletions src/cuda_core.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ static inline void compat_usleep(int waitTime)
#include "cuda_device.hpp"
#include "cuda_fast_int_math_v2.hpp"
#include "cuda_fast_div_heavy.hpp"
#ifdef XMRIG_ALGO_CN_GPU
#include "cuda_cryptonight_gpu.hpp"
#endif

#if defined(__x86_64__) || defined(_M_AMD64) || defined(__LP64__)
# define _ASM_PTR_ "l"
Expand Down Expand Up @@ -508,7 +510,7 @@ __global__ void cryptonight_core_gpu_phase2_quad(

float conc_var;
if (ALGO == Algorithm::CN_CCX) {
conc_var = (partidx != 0) ? int_as_float(*(d_ctx_b + threads * 4 + thread * 4 + sub)) : 0.0f;
conc_var = (partidx != 0) ? __int_as_float(*(d_ctx_b + threads * 4 + thread * 4 + sub)) : 0.0f;
}

#pragma unroll 2
Expand Down Expand Up @@ -546,9 +548,9 @@ __global__ void cryptonight_core_gpu_phase2_quad(
uint32_t x_0 = loadGlobal32<uint32_t>(long_state + j);

if (ALGO == Algorithm::CN_CCX) {
float r = int2float((int32_t)x_0) + conc_var;
r = int_as_float((float_as_int(r * r * r) & 0x807FFFFF) | 0x40000000);
x_0 ^= (int32_t)(int_as_float((float_as_int(conc_var) & 0x807FFFFF) | 0x40000000) * 536870880.0f);
float r = __int2float_rn((int32_t)x_0) + conc_var;
r = __int_as_float((__float_as_int(r * r * r) & 0x807FFFFF) | 0x40000000);
x_0 ^= (int32_t)(__int_as_float((__float_as_int(conc_var) & 0x807FFFFF) | 0x40000000) * 536870880.0f);
conc_var += r;
}

Expand Down Expand Up @@ -639,7 +641,7 @@ __global__ void cryptonight_core_gpu_phase2_quad(
}
}
if (ALGO == Algorithm::CN_CCX) {
*(d_ctx_b + threads * 4 + thread * 4 + sub) = float_as_int(conc_var);
*(d_ctx_b + threads * 4 + thread * 4 + sub) = __float_as_int(conc_var);
}
}
}
Expand Down Expand Up @@ -818,6 +820,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
}


#ifdef XMRIG_ALGO_CN_GPU
template<xmrig_cuda::Algorithm::Id ALGO>
void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce)
{
Expand Down Expand Up @@ -883,6 +886,7 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce)
ctx->d_ctx_state, ctx->d_ctx_key2 ));
}
}
#endif

void cryptonight_gpu_hash(nvid_ctx *ctx, const xmrig_cuda::Algorithm &algorithm, uint64_t height, uint32_t startNonce)
{
Expand Down Expand Up @@ -962,9 +966,11 @@ void cryptonight_gpu_hash(nvid_ctx *ctx, const xmrig_cuda::Algorithm &algorithm,
cryptonight_core_gpu_hash<Algorithm::CN_CCX>(ctx, startNonce);
break;

# ifdef XMRIG_ALGO_CN_GPU
case Algorithm::CN_GPU:
cryptonight_core_gpu_hash_gpu<Algorithm::CN_GPU>(ctx, startNonce);
break;
# endif

default:
break;
Expand All @@ -980,9 +986,11 @@ void cryptonight_gpu_hash(nvid_ctx *ctx, const xmrig_cuda::Algorithm &algorithm,
cryptonight_core_gpu_hash<Algorithm::CN_LITE_1>(ctx, startNonce);
break;

# ifdef XMRIG_ALGO_CN_GPU
case Algorithm::CN_GPU:
cryptonight_core_gpu_hash_gpu<Algorithm::CN_GPU>(ctx, startNonce);
break;
# endif

default:
break;
Expand Down
32 changes: 16 additions & 16 deletions src/cuda_cryptonight_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ struct __m128 : public float4

__forceinline__ __device__ __m128( const __m128i& x0)
{
float4::x = int2float(x0.x);
float4::y = int2float(x0.y);
float4::z = int2float(x0.z);
float4::w = int2float(x0.w);
float4::x = __int2float_rn(x0.x);
float4::y = __int2float_rn(x0.y);
float4::z = __int2float_rn(x0.z);
float4::w = __int2float_rn(x0.w);
}

__forceinline__ __device__ __m128i get_int( )
Expand Down Expand Up @@ -217,30 +217,30 @@ __forceinline__ __device__ __m128 _mm_div_ps(__m128 a, __m128 b)
__forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b)
{
return __m128(
int_as_float(float_as_int(a.x) & b),
int_as_float(float_as_int(a.y) & b),
int_as_float(float_as_int(a.z) & b),
int_as_float(float_as_int(a.w) & b)
__int_as_float(__float_as_int(a.x) & b),
__int_as_float(__float_as_int(a.y) & b),
__int_as_float(__float_as_int(a.z) & b),
__int_as_float(__float_as_int(a.w) & b)
);
}

__forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b)
{
return __m128(
int_as_float(float_as_int(a.x) | b),
int_as_float(float_as_int(a.y) | b),
int_as_float(float_as_int(a.z) | b),
int_as_float(float_as_int(a.w) | b)
__int_as_float(__float_as_int(a.x) | b),
__int_as_float(__float_as_int(a.y) | b),
__int_as_float(__float_as_int(a.z) | b),
__int_as_float(__float_as_int(a.w) | b)
);
}

__forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b)
{
return __m128(
int_as_float(float_as_int(a.x) ^ b),
int_as_float(float_as_int(a.y) ^ b),
int_as_float(float_as_int(a.z) ^ b),
int_as_float(float_as_int(a.w) ^ b)
__int_as_float(__float_as_int(a.x) ^ b),
__int_as_float(__float_as_int(a.y) ^ b),
__int_as_float(__float_as_int(a.z) ^ b),
__int_as_float(__float_as_int(a.w) ^ b)
);
}

Expand Down
20 changes: 11 additions & 9 deletions src/cuda_extra.cu
Original file line number Diff line number Diff line change
Expand Up @@ -286,11 +286,10 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
}


#ifdef XMRIG_ALGO_CN_GPU
template<xmrig_cuda::Algorithm::Id ALGO>
__global__ void cryptonight_gpu_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 )
{
using namespace xmrig_cuda;

const int thread = blockDim.x * blockIdx.x + threadIdx.x;

__shared__ uint32_t sharedMemory[1024];
Expand Down Expand Up @@ -334,6 +333,7 @@ __global__ void cryptonight_gpu_extra_gpu_final( int threads, uint64_t target, u
d_res_nonce[idx] = thread;
}
}
#endif

void cryptonight_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len)
{
Expand Down Expand Up @@ -481,13 +481,13 @@ void cryptonight_extra_cpu_final(nvid_ctx *ctx, uint32_t startNonce, uint64_t ta

if (algorithm.family() == Algorithm::CN_HEAVY) {
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_final<Algorithm::CN_HEAVY_0><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ));
# ifdef XMRIG_ALGO_CN_GPU
} else if (algorithm == Algorithm::CN_GPU) {
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_gpu_extra_gpu_final<Algorithm::CN_GPU> << <grid, block >> > (wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
# endif
} else {
if (algorithm == Algorithm::CN_GPU) {
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_gpu_extra_gpu_final<Algorithm::CN_GPU> << <grid, block >> > (wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
} else {
// fallback for all other algorithms
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_final<Algorithm::CN_0> << <grid, block >> > (wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
}
// fallback for all other algorithms
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_final<Algorithm::CN_0> << <grid, block >> > (wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
}

CUDA_CHECK(ctx->device_id, cudaDeviceSynchronize());
Expand Down Expand Up @@ -620,7 +620,7 @@ int cuda_get_deviceinfo(nvid_ctx *ctx)
// Leave memory for 2080 MB dataset + 64 MB free
// Each thread uses 1 scratchpad plus a few small buffers on GPU
const size_t dataset_size = 2080u << 20;
const size_t max_blocks = (freeMemory - (ctx->rx_dataset_host ? 0 : dataset_size) - (64u << 20)) / (ctx->algorithm.l3() + 32768) / 32;
const int max_blocks = (freeMemory - (ctx->rx_dataset_host ? 0 : dataset_size) - (64u << 20)) / (ctx->algorithm.l3() + 32768) / 32;
if (ctx->device_blocks > max_blocks) {
ctx->device_blocks = max_blocks;
}
Expand Down Expand Up @@ -730,6 +730,7 @@ int cuda_get_deviceinfo(nvid_ctx *ctx)
}
}

# ifdef XMRIG_ALGO_CN_GPU
if (ctx->algorithm == Algorithm::CN_GPU && props.major < 7) {
int t = 32;
int b = ctx->device_blocks;
Expand All @@ -746,6 +747,7 @@ int cuda_get_deviceinfo(nvid_ctx *ctx)
ctx->device_blocks = b;
}
}
# endif

ctx->device_threads = std::min(ctx->device_threads, (props.major == 2 ? 64 : 128));
}
Expand Down
26 changes: 5 additions & 21 deletions src/xmrig-cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,14 @@ bool rxHash(nvid_ctx *ctx, uint32_t startNonce, uint64_t target, uint32_t *resco
RandomX_Keva::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
break;

case Algorithm::RX_XLA:
throw std::runtime_error(kUnsupportedAlgorithm);
//RandomX_DefyX::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);

case Algorithm::RX_GRAFT:
RandomX_Graft::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
break;

case Algorithm::RX_XLA:
//RandomX_Panthera::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
//break;

default:
throw std::runtime_error(kUnsupportedAlgorithm);
}
Expand All @@ -231,11 +231,7 @@ bool rxPrepare(nvid_ctx *ctx, const void *dataset, size_t datasetSize, bool, uin
resetError(ctx->device_id);

try {
# ifdef XMRIG_ALGO_RANDOMX
randomx_prepare(ctx, ctx->rx_dataset_host > 0 ? datasetHost.reg(dataset, datasetSize) : dataset, datasetSize, batchSize);
# else
throw std::runtime_error(kUnsupportedAlgorithm);
# endif
}
catch (std::exception &ex) {
return saveError(ctx->device_id, ex);
Expand Down Expand Up @@ -284,11 +280,7 @@ bool astroBWTPrepare(nvid_ctx *ctx, uint32_t batchSize)
resetError(ctx->device_id);

try {
# ifdef XMRIG_ALGO_ASTROBWT
astrobwt_prepare(ctx, batchSize);
# else
throw std::runtime_error(kUnsupportedAlgorithm);
# endif
}
catch (std::exception &ex) {
return saveError(ctx->device_id, ex);
Expand Down Expand Up @@ -337,11 +329,7 @@ bool kawPowPrepare_v2(nvid_ctx *ctx, const void* cache, size_t cache_size, const
resetError(ctx->device_id);

try {
# ifdef XMRIG_ALGO_KAWPOW
kawpow_prepare(ctx, cache, cache_size, dag_precalc, dag_size, height, dag_sizes);
# else
throw std::runtime_error(kUnsupportedAlgorithm);
# endif
}
catch (std::exception &ex) {
return saveError(ctx->device_id, ex);
Expand All @@ -360,11 +348,7 @@ bool kawPowStopHash(nvid_ctx *ctx)

# ifdef XMRIG_ALGO_KAWPOW
try {
# ifdef XMRIG_ALGO_KAWPOW
kawpow_stop_hash(ctx);
# else
throw std::runtime_error(kUnsupportedAlgorithm);
# endif
}
catch (std::exception &ex) {
return saveError(ctx->device_id, ex);
Expand Down Expand Up @@ -611,7 +595,7 @@ void release(nvid_ctx *ctx)
cudaFree(ctx->astrobwt_offsets_begin);
cudaFree(ctx->astrobwt_offsets_end);

# ifdef WITH_KAWPOW
# ifdef XMRIG_ALGO_KAWPOW
cudaFree(ctx->kawpow_cache);
cudaFree(ctx->kawpow_dag);
cudaFreeHost(ctx->kawpow_stop_host);
Expand Down