Merge xmrig-cuda v6.22.0 into master

MoneroOcean · Aug 12, 2024 · beb3cd0 · beb3cd0
2 parents e821970 + 01d5348
commit beb3cd0
Show file tree

Hide file tree

Showing 11 changed files with 289 additions and 18 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,6 @@
+# v6.22.0
+- [#201](https://github.com/xmrig/xmrig-cuda/pull/201) Added support for [Yada](https://yadacoin.io/) (`rx/yada` algorithm).
+
 # v6.21.1
 - The binary downloads now only support the latest version of each major CUDA release.
 - Improved build speed with CUDA 11.3 or higher.

diff --git a/cmake/CUDA.cmake b/cmake/CUDA.cmake
@@ -57,6 +57,7 @@ if (NOT CUDA_VERSION VERSION_LESS 11.5)
 endif()
 
 if (NOT CUDA_VERSION VERSION_LESS 11.8)
+    list(APPEND DEFAULT_CUDA_ARCH "89")
     list(APPEND DEFAULT_CUDA_ARCH "90")
 endif()
 list(SORT DEFAULT_CUDA_ARCH)
@@ -236,6 +237,8 @@ if (WITH_RANDOMX)
         src/RandomX/randomx.cu
         src/RandomX/wownero/configuration.h
         src/RandomX/wownero/randomx_wownero.cu
+        src/RandomX/yada/configuration.h
+        src/RandomX/yada/randomx_yada.cu
     )
 else()
     set(CUDA_RANDOMX_SOURCES "")

diff --git a/src/RandomX/blake2b_cuda.hpp b/src/RandomX/blake2b_cuda.hpp
@@ -205,6 +205,90 @@ __device__ void blake2b_512_process_double_block(uint64_t *out, uint64_t* m, con
 	if (out_len > 56) out[7] = h[7] ^ v[7] ^ v[15];
 }
 
+// Overwrites input bytes nonce_offset .. nonce_offset + 3 with `nonce` (least significant byte first),
+// restricted to the bytes held in the message block `m`.
+//   m           - the 16 message words of the current block
+//   block_start - offset of the block's first byte within the whole input
+//   block_len   - number of input bytes in the block; positions at or beyond it are not touched
+static __device__ __forceinline__ void blake2b_big_block_set_nonce(uint64_t* m, uint32_t nonce, uint32_t nonce_offset, uint32_t block_start, uint32_t block_len)
+{
+	for (uint32_t i = 0; i < 4; ++i) {
+		// Position relative to this block; a byte located before the block wraps around to a huge value and is rejected below
+		const uint32_t pos = nonce_offset + i - block_start;
+
+		if (pos < block_len) {
+			const uint32_t shift = (pos % 8) * 8;
+
+			// Clear the destination byte before inserting so that non-zero template bytes cannot be OR-ed into the nonce
+			m[pos / 8] = (m[pos / 8] & ~((uint64_t)(255) << shift)) | ((uint64_t)((nonce >> (i * 8)) & 255) << shift);
+		}
+	}
+}
+
+// Blake2b over an input of arbitrary length, with a 32-bit nonce patched into the data on the fly.
+//   out_len      - digest length in bytes; only the first ceil(out_len / 8) words of `out` are written
+//   out          - receives the digest as 64-bit words
+//   in           - input, read as 64-bit words (the last word is read whole even when in_len ends inside it)
+//   in_len       - input length in bytes; an empty input is hashed without reading `in`
+//   nonce        - value placed at input bytes nonce_offset .. nonce_offset + 3, least significant byte first
+//   nonce_offset - byte offset of the nonce within the input; the nonce may straddle two blocks
+// The input buffer is never modified: the nonce is applied to the per-block copy of the data.
+template<uint32_t out_len>
+__device__ void blake2b_512_process_big_block(uint64_t* out, const uint64_t* in, uint32_t in_len, uint32_t nonce, uint32_t nonce_offset)
+{
+	// Initial chaining value: IV with the parameter word (digest length out_len, fanout 1, depth 1, no key) folded into h[0]
+	uint64_t h[8] = { Blake2b_IV::iv0 ^ (0x01010000u | out_len), Blake2b_IV::iv1, Blake2b_IV::iv2, Blake2b_IV::iv3, Blake2b_IV::iv4, Blake2b_IV::iv5, Blake2b_IV::iv6, Blake2b_IV::iv7 };
+
+	// Every 128-byte block that is followed by more input; `t` is the byte count up to and including the current block
+	for (uint32_t t = 128; t < in_len; t += 128, in += 16) {
+		uint64_t m[16] = { in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in[8], in[9], in[10], in[11],  in[12], in[13], in[14], in[15] };
+
+		blake2b_big_block_set_nonce(m, nonce, nonce_offset, t - 128, 128);
+
+		// Work vector: chaining value, then IV with the byte counter mixed into word 12
+		uint64_t v[16] = { h[0],  h[1],  h[2],  h[3],  h[4],  h[5],  h[6],  h[7],   Blake2b_IV::iv0,   Blake2b_IV::iv1,    Blake2b_IV::iv2,    Blake2b_IV::iv3, Blake2b_IV::iv4 ^ t,    Blake2b_IV::iv5,    Blake2b_IV::iv6,    Blake2b_IV::iv7 };
+
+		BLAKE2B_ROUNDS();
+
+		h[0] ^= v[0] ^ v[8];
+		h[1] ^= v[1] ^ v[9];
+		h[2] ^= v[2] ^ v[10];
+		h[3] ^= v[3] ^ v[11];
+		h[4] ^= v[4] ^ v[12];
+		h[5] ^= v[5] ^ v[13];
+		h[6] ^= v[6] ^ v[14];
+		h[7] ^= v[7] ^ v[15];
+	}
+
+	// Size of the final block: 1..128 bytes, or 0 when the input is empty
+	uint32_t k = in_len & 127;
+	if ((k == 0) && (in_len != 0)) k = 128;
+
+	// Load only the words that contain input bytes; the rest of the block is zero padding
+	uint64_t m[16] = {
+		(k > 0) ? in[0] : 0,
+		(k > 8) ? in[1] : 0,
+		(k > 16) ? in[2] : 0,
+		(k > 24) ? in[3] : 0,
+		(k > 32) ? in[4] : 0,
+		(k > 40) ? in[5] : 0,
+		(k > 48) ? in[6] : 0,
+		(k > 56) ? in[7] : 0,
+		(k > 64) ? in[8] : 0,
+		(k > 72) ? in[9] : 0,
+		(k > 80) ? in[10] : 0,
+		(k > 88) ? in[11] : 0,
+		(k > 96) ? in[12] : 0,
+		(k > 104) ? in[13] : 0,
+		(k > 112) ? in[14] : 0,
+		(k > 120) ? in[15] : 0
+	};
+
+	// Offset of the final block within the input
+	const uint32_t t = in_len - k;
+
+	blake2b_big_block_set_nonce(m, nonce, nonce_offset, t, k);
+
+	// Zero the bytes of the last loaded word that lie beyond the end of the input
+	if (k % 8) {
+		m[k / 8] &= (uint64_t)(-1) >> (64 - (k % 8) * 8);
+	}
+
+	// Final block: the counter is the total input length and word 14 is inverted as the last-block flag
+	uint64_t v[16] = { h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], Blake2b_IV::iv0, Blake2b_IV::iv1, Blake2b_IV::iv2, Blake2b_IV::iv3, Blake2b_IV::iv4 ^ in_len, Blake2b_IV::iv5, ~Blake2b_IV::iv6, Blake2b_IV::iv7 };
+
+	BLAKE2B_ROUNDS();
+
+	if (out_len > 0) out[0] = h[0] ^ v[0] ^ v[8];
+	if (out_len > 8) out[1] = h[1] ^ v[1] ^ v[9];
+	if (out_len > 16) out[2] = h[2] ^ v[2] ^ v[10];
+	if (out_len > 24) out[3] = h[3] ^ v[3] ^ v[11];
+	if (out_len > 32) out[4] = h[4] ^ v[4] ^ v[12];
+	if (out_len > 40) out[5] = h[5] ^ v[5] ^ v[13];
+	if (out_len > 48) out[6] = h[6] ^ v[6] ^ v[14];
+	if (out_len > 56) out[7] = h[7] ^ v[7] ^ v[15];
+}
+
 #undef G
 #undef ROUND
 #undef BLAKE2B_ROUNDS
@@ -280,6 +364,25 @@ __global__ void blake2b_initial_hash_double(void* out, const void* blockTemplate
 	t[7] = hash[7];
 }
 
+// One thread per nonce: hashes the block template with nonce (start_nonce + global thread index)
+// placed at byte offset nonce_offset and stores the 64-byte digest in slot [global thread index] of `out`.
+// There is no bounds check, so the launch must not contain more threads than `out` has 64-byte slots.
+__global__ void blake2b_initial_hash_big(void* out, const void* blockTemplate, uint32_t blockTemplateSize, uint32_t start_nonce, uint32_t nonce_offset)
+{
+	const uint32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+	uint64_t digest[8];
+	blake2b_512_process_big_block<64>(digest, (const uint64_t*) blockTemplate, blockTemplateSize, start_nonce + thread_id, nonce_offset);
+
+	// Each thread owns eight consecutive 64-bit words of the output buffer
+	uint64_t* dst = ((uint64_t*) out) + thread_id * 8;
+
+	#pragma unroll
+	for (uint32_t i = 0; i < 8; ++i) {
+		dst[i] = digest[i];
+	}
+}
+
 template<uint32_t registers_len, uint32_t registers_stride, uint32_t out_len>
 __global__ void blake2b_hash_registers(void *out, const void* in)
 {

diff --git a/src/RandomX/hash.hpp b/src/RandomX/hash.hpp
@@ -32,7 +32,7 @@ __global__ void find_shares(const void* hashes, uint64_t target, uint32_t* share
     }
 }
 
-void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size)
+void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size)
 {
     if (ctx->algorithm.id() == xmrig_cuda::Algorithm::RX_XLA) {
 //        sipesh(tempHash, sizeof(tempHash), input, inputSize, input, inputSize, 0, 0);
@@ -46,8 +46,7 @@ void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, ui
         CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash_double << <batch_size / 32, 32 >> > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
     }
     else {
-        *rescount = 0;
-        return;
+        CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash_big << <batch_size / 32, 32 >> > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce, nonce_offset));
     }
 
     CUDA_CHECK_KERNEL(ctx->device_id, fillAes1Rx4<RANDOMX_SCRATCHPAD_L3, false, 64><<<batch_size / 32, 32 * 4>>>(ctx->d_rx_hashes, ctx->d_long_state, batch_size));

diff --git a/src/RandomX/yada/configuration.h b/src/RandomX/yada/configuration.h
@@ -0,0 +1,125 @@
+/*
+Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+//Cache size in KiB (262144 KiB = 256 MiB). Must be a power of 2.
+#define RANDOMX_ARGON_MEMORY       262144
+
+//Number of Argon2d iterations for Cache initialization.
+#define RANDOMX_ARGON_ITERATIONS   4
+
+//Number of parallel lanes for Cache initialization.
+#define RANDOMX_ARGON_LANES        1
+
+//Argon2d salt
+#define RANDOMX_ARGON_SALT         "RandomXYadaCoin\x03"
+
+//Number of random Cache accesses per Dataset item. Minimum is 2.
+#define RANDOMX_CACHE_ACCESSES     8
+
+//Target latency for SuperscalarHash (in cycles of the reference CPU).
+#define RANDOMX_SUPERSCALAR_LATENCY   150
+
+//Dataset base size in bytes (2147483648 = 2^31 = 2 GiB). Must be a power of 2.
+#define RANDOMX_DATASET_BASE_SIZE  2147483648
+
+//Dataset extra size (33554368 = 2^25 - 64). Must be divisible by 64.
+#define RANDOMX_DATASET_EXTRA_SIZE 33554368
+
+//Number of instructions in a RandomX program. Must be divisible by 8.
+#define RANDOMX_PROGRAM_SIZE       256
+
+//Number of iterations during VM execution.
+#define RANDOMX_PROGRAM_ITERATIONS 2048
+
+//Number of chained VM executions per hash.
+#define RANDOMX_PROGRAM_COUNT      8
+
+//Scratchpad L3 size in bytes (2097152 = 2 MiB). Must be a power of 2.
+#define RANDOMX_SCRATCHPAD_L3      2097152
+
+//Scratchpad L2 size in bytes (262144 = 256 KiB). Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3.
+#define RANDOMX_SCRATCHPAD_L2      262144
+
+//Scratchpad L1 size in bytes (16384 = 16 KiB). Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2.
+#define RANDOMX_SCRATCHPAD_L1      16384
+
+//Jump condition mask size in bits.
+#define RANDOMX_JUMP_BITS          8
+
+//Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16 (here 8 + 8 = 16, the maximum).
+#define RANDOMX_JUMP_OFFSET        8
+
+/*
+Instruction frequencies (per 256 opcodes)
+Total sum of frequencies must be 256
+*/
+
+//Integer instructions
+#define RANDOMX_FREQ_IADD_RS       16
+#define RANDOMX_FREQ_IADD_M         7
+#define RANDOMX_FREQ_ISUB_R        16
+#define RANDOMX_FREQ_ISUB_M         7
+#define RANDOMX_FREQ_IMUL_R        16
+#define RANDOMX_FREQ_IMUL_M         4
+#define RANDOMX_FREQ_IMULH_R        4
+#define RANDOMX_FREQ_IMULH_M        1
+#define RANDOMX_FREQ_ISMULH_R       4
+#define RANDOMX_FREQ_ISMULH_M       1
+#define RANDOMX_FREQ_IMUL_RCP       8
+#define RANDOMX_FREQ_INEG_R         2
+#define RANDOMX_FREQ_IXOR_R        15
+#define RANDOMX_FREQ_IXOR_M         5
+#define RANDOMX_FREQ_IROR_R         8
+#define RANDOMX_FREQ_IROL_R         2
+#define RANDOMX_FREQ_ISWAP_R        4
+
+//Floating point instructions
+#define RANDOMX_FREQ_FSWAP_R        4
+#define RANDOMX_FREQ_FADD_R        16
+#define RANDOMX_FREQ_FADD_M         5
+#define RANDOMX_FREQ_FSUB_R        16
+#define RANDOMX_FREQ_FSUB_M         5
+#define RANDOMX_FREQ_FSCAL_R        6
+#define RANDOMX_FREQ_FMUL_R        32
+#define RANDOMX_FREQ_FDIV_M         4
+#define RANDOMX_FREQ_FSQRT_R        6
+
+//Control instructions
+#define RANDOMX_FREQ_CBRANCH       25
+#define RANDOMX_FREQ_CFROUND        1
+
+//Store instruction
+#define RANDOMX_FREQ_ISTORE        16
+
+//No-op instruction
+#define RANDOMX_FREQ_NOP            0
+/*                               ------
+                                  256
+*/
diff --git a/src/RandomX/yada/randomx_yada.cu b/src/RandomX/yada/randomx_yada.cu
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2019-2020 SChernykh
+
+This file is part of RandomX CUDA.
+
+RandomX CUDA is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX CUDA is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX CUDA.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "cryptonight.h"
+#include "cuda_device.hpp"
+
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cstdint>
+
+
+namespace RandomX_Yada { // rx/yada build: everything pulled in by the includes below is declared inside this namespace
+    #include "configuration.h" // presumably resolves to the yada configuration.h next to this file (RANDOMX_* parameter macros) — confirm include paths
+    #define fillAes4Rx4 fillAes4Rx4_v104 // from this line on the token fillAes4Rx4 expands to fillAes4Rx4_v104; it is not #undef'd in this file
+    #include "RandomX/common.hpp" // NOTE(review): presumably the shared implementation that consumes the macros defined above — confirm
+}
diff --git a/src/crypto/common/Algorithm.cpp b/src/crypto/common/Algorithm.cpp
@@ -47,7 +47,7 @@ xmrig_cuda::Algorithm::Id xmrig_cuda::Algorithm::parse(uint32_t id)
 #       endif
 #       ifdef XMRIG_ALGO_RANDOMX
         RX_XLA,
-        RX_0, RX_WOW, RX_ARQ, RX_GRAFT, RX_SFX, RX_KEVA,
+        RX_0, RX_WOW, RX_ARQ, RX_GRAFT, RX_SFX, RX_KEVA, RX_YADA,
 #       endif
 #       ifdef XMRIG_ALGO_ARGON2
         AR2_CHUKWA, AR2_CHUKWA_V2, AR2_WRKZ,

diff --git a/src/crypto/common/Algorithm.h b/src/crypto/common/Algorithm.h
@@ -61,6 +61,7 @@ class Algorithm
         RX_GRAFT        = 0x72151267,   // "rx/graft"         RandomGRAFT (Graft).
         RX_SFX          = 0x72151273,   // "rx/sfx"           RandomSFX (Safex Cash).
         RX_KEVA         = 0x7214116b,   // "rx/keva"          RandomKEVA (Keva).
+        RX_YADA         = 0x72151279,   // "rx/yada"          RandomYada (YadaCoin).
         AR2_CHUKWA      = 0x61130000,   // "argon2/chukwa"    Argon2id (Chukwa).
         AR2_CHUKWA_V2   = 0x61140000,   // "argon2/chukwav2"  Argon2id (Chukwa v2).
         AR2_WRKZ        = 0x61120000,   // "argon2/wrkz"      Argon2id (WRKZ)

diff --git a/src/cryptonight.h b/src/cryptonight.h
@@ -121,12 +121,12 @@ void cryptonight_extra_cpu_final(nvid_ctx *ctx, uint32_t startNonce, uint64_t ta
 void cuda_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len);
 void randomx_prepare(nvid_ctx *ctx, const void *dataset, size_t dataset_size, uint32_t batch_size);
 
-namespace RandomX_Arqma   { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
-namespace RandomX_Monero  { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
-namespace RandomX_Wownero { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
-namespace RandomX_Keva    { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
-namespace RandomX_DefyX   { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
-namespace RandomX_Graft   { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
+namespace RandomX_Arqma   { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
+namespace RandomX_Monero  { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
+namespace RandomX_Wownero { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
+namespace RandomX_Keva    { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
+namespace RandomX_Graft   { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
+namespace RandomX_Yada    { void hash(nvid_ctx* ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t* rescount, uint32_t* resnonce, uint32_t batch_size); }
 
 #ifdef XMRIG_ALGO_KAWPOW
 void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const void* dag_precalc, size_t dag_size, uint32_t height, const uint64_t* dag_sizes);

diff --git a/src/version.h b/src/version.h
@@ -22,14 +22,14 @@
 #define APP_ID        "xmrig-cuda"
 #define APP_NAME      "XMRig"
 #define APP_DESC      "XMRig CUDA plugin"
-#define APP_VERSION   "6.21.1-mo1"
+#define APP_VERSION   "6.22.0-mo1"
 #define APP_DOMAIN    "xmrig.com"
 #define APP_SITE      "www.xmrig.com"
 #define APP_COPYRIGHT "Copyright (C) 2016-2024 xmrig.com"
 
 #define APP_VER_MAJOR  6
-#define APP_VER_MINOR  21
-#define APP_VER_PATCH  1
+#define APP_VER_MINOR  22
+#define APP_VER_PATCH  0
 
 #define API_VERSION    4