Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update patchset to reduce differences to upstream #6

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Main reasons why this plugin is separated project is:

## Windows usage

* [Download](https://github.com/xmrig/xmrig-cuda/releases) the plugin; you must choose a CUDA version, usually the most recent one (CUDA 10.1), but builds with older CUDA versions are also provided. Alternatively, you can build the plugin from source (requires GCC 9).
* [Download](https://github.com/MoneroOcean/xmrig-cuda/releases) the plugin; you must choose a CUDA version, usually the most recent one (CUDA 10.1), but builds with older CUDA versions are also provided. Alternatively, you can build the plugin from source (requires GCC 9).
* Place **`xmrig-cuda.dll`** and other dll files near to **`xmrig.exe`**.
* Edit **`config.json`** to enable CUDA support.
```
Expand Down
10 changes: 7 additions & 3 deletions src/RandomX/hash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,13 @@ void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, ui
// CUDA_CHECK_KERNEL(ctx->device_id, sipesh<<<batch_size / 32, 32>>>(ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
// k12(input, inputSize, tempHash);
// CUDA_CHECK_KERNEL(ctx->device_id, k12<<<batch_size / 32, 32>>>(ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
} else {
CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash<<<batch_size / 32, 32>>>(ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
}
//=======
// rx_blake2b_wrapper::run(out, outlen, in, inlen);
// yespower_params_t params = { YESPOWER_1_0, 2048, 8, NULL };
// if (yespower_tls((const uint8_t *)out, outlen, &params, (yespower_binary_t *)out)) return -1;
// return KangarooTwelve((const unsigned char *)out, outlen, (unsigned char *)out, 32, 0, 0);
} else
CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash<<<batch_size / 32, 32>>>(ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
CUDA_CHECK_KERNEL(ctx->device_id, fillAes1Rx4<RANDOMX_SCRATCHPAD_L3, false, 64><<<batch_size / 32, 32 * 4>>>(ctx->d_rx_hashes, ctx->d_long_state, batch_size));
CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_rx_rounding, 0, batch_size * sizeof(uint32_t)));

Expand Down
2 changes: 2 additions & 0 deletions src/RandomX/randomx_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,12 +316,14 @@ __device__ void print_inst(uint2 inst)
}
opcode -= RANDOMX_FREQ_IROR_R;

#if RANDOMX_FREQ_IROL_R > 0
if (opcode < RANDOMX_FREQ_IROL_R)
{
printf("%s%sIROL_R r%u, r%u ", branch_target, fp_inst, dst, src);
break;
}
opcode -= RANDOMX_FREQ_IROL_R;
#endif

if (opcode < RANDOMX_FREQ_ISWAP_R)
{
Expand Down
4 changes: 4 additions & 0 deletions src/crypto/cn/CnAlgo.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,12 +142,16 @@ template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_PICO_0>::iterations()
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_PICO_TLO>::iterations() const { return CN_ITER / 8; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_CCX>::iterations() const { return CN_ITER / 2; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_UPX2>::iterations() const { return CN_ITER / 32; }
#ifdef XMRIG_ALGO_CN_GPU
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_GPU>::iterations() const { return 0xC000; }
#endif


template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_PICO_0>::mask() const { return 0x1FFF0; }
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_UPX2>::mask() const { return 0x1FFF0; }
#ifdef XMRIG_ALGO_CN_GPU
template<> constexpr inline uint32_t CnAlgo<Algorithm::CN_GPU>::mask() const { return 0x1FFFC0; }
#endif


} /* namespace xmrig_cuda */
Expand Down
2 changes: 1 addition & 1 deletion src/crypto/common/Algorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ xmrig_cuda::Algorithm::Id xmrig_cuda::Algorithm::parse(uint32_t id)
CN_GPU,
# endif
# ifdef XMRIG_ALGO_RANDOMX
RX_XLA,
RX_0, RX_WOW, RX_ARQ, RX_GRAFT, RX_SFX, RX_KEVA,
RX_XLA,
# endif
# ifdef XMRIG_ALGO_ARGON2
AR2_CHUKWA, AR2_CHUKWA_V2, AR2_WRKZ,
Expand Down
3 changes: 1 addition & 2 deletions src/crypto/common/Algorithm.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,12 @@ class Algorithm
RX_GRAFT = 0x72151267, // "rx/graft" RandomGRAFT (Graft).
RX_SFX = 0x72151273, // "rx/sfx" RandomSFX (Safex Cash).
RX_KEVA = 0x7214116b, // "rx/keva" RandomKEVA (Keva).
RX_XLA = 0x721211ff, // "panthera" Panthera (Scala2).
AR2_CHUKWA = 0x61130000, // "argon2/chukwa" Argon2id (Chukwa).
AR2_CHUKWA_V2 = 0x61140000, // "argon2/chukwav2" Argon2id (Chukwa v2).
AR2_WRKZ = 0x61120000, // "argon2/wrkz" Argon2id (WRKZ)
ASTROBWT_DERO = 0x41000000, // "astrobwt" AstroBWT (Dero)
KAWPOW_RVN = 0x6b0f0000, // "kawpow/rvn" KawPow (RVN)

RX_XLA = 0x721211ff, // "panthera" Panthera (Scala2).
};

enum Family : uint32_t {
Expand Down
2 changes: 1 addition & 1 deletion src/cryptonight.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ namespace RandomX_Arqma { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t ta
namespace RandomX_Monero { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Wownero { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Keva { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_DefyX { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_Graft { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
namespace RandomX_DefyX { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }

void astrobwt_prepare(nvid_ctx *ctx, uint32_t batch_size);

Expand Down
18 changes: 13 additions & 5 deletions src/cuda_core.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ static inline void compat_usleep(int waitTime)
#include "cuda_device.hpp"
#include "cuda_fast_int_math_v2.hpp"
#include "cuda_fast_div_heavy.hpp"
#ifdef XMRIG_ALGO_CN_GPU
#include "cuda_cryptonight_gpu.hpp"
#endif

#if defined(__x86_64__) || defined(_M_AMD64) || defined(__LP64__)
# define _ASM_PTR_ "l"
Expand Down Expand Up @@ -508,7 +510,7 @@ __global__ void cryptonight_core_gpu_phase2_quad(

float conc_var;
if (ALGO == Algorithm::CN_CCX) {
conc_var = (partidx != 0) ? int_as_float(*(d_ctx_b + threads * 4 + thread * 4 + sub)) : 0.0f;
conc_var = (partidx != 0) ? __int_as_float(*(d_ctx_b + threads * 4 + thread * 4 + sub)) : 0.0f;
}

#pragma unroll 2
Expand Down Expand Up @@ -546,9 +548,9 @@ __global__ void cryptonight_core_gpu_phase2_quad(
uint32_t x_0 = loadGlobal32<uint32_t>(long_state + j);

if (ALGO == Algorithm::CN_CCX) {
float r = int2float((int32_t)x_0) + conc_var;
r = int_as_float((float_as_int(r * r * r) & 0x807FFFFF) | 0x40000000);
x_0 ^= (int32_t)(int_as_float((float_as_int(conc_var) & 0x807FFFFF) | 0x40000000) * 536870880.0f);
float r = __int2float_rn((int32_t)x_0) + conc_var;
r = __int_as_float((__float_as_int(r * r * r) & 0x807FFFFF) | 0x40000000);
x_0 ^= (int32_t)(__int_as_float((__float_as_int(conc_var) & 0x807FFFFF) | 0x40000000) * 536870880.0f);
conc_var += r;
}

Expand Down Expand Up @@ -639,7 +641,7 @@ __global__ void cryptonight_core_gpu_phase2_quad(
}
}
if (ALGO == Algorithm::CN_CCX) {
*(d_ctx_b + threads * 4 + thread * 4 + sub) = float_as_int(conc_var);
*(d_ctx_b + threads * 4 + thread * 4 + sub) = __float_as_int(conc_var);
}
}
}
Expand Down Expand Up @@ -818,6 +820,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
}


#ifdef XMRIG_ALGO_CN_GPU
template<xmrig_cuda::Algorithm::Id ALGO>
void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce)
{
Expand Down Expand Up @@ -883,6 +886,7 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce)
ctx->d_ctx_state, ctx->d_ctx_key2 ));
}
}
#endif

void cryptonight_gpu_hash(nvid_ctx *ctx, const xmrig_cuda::Algorithm &algorithm, uint64_t height, uint32_t startNonce)
{
Expand Down Expand Up @@ -962,9 +966,11 @@ void cryptonight_gpu_hash(nvid_ctx *ctx, const xmrig_cuda::Algorithm &algorithm,
cryptonight_core_gpu_hash<Algorithm::CN_CCX>(ctx, startNonce);
break;

# ifdef XMRIG_ALGO_CN_GPU
case Algorithm::CN_GPU:
cryptonight_core_gpu_hash_gpu<Algorithm::CN_GPU>(ctx, startNonce);
break;
# endif

default:
break;
Expand All @@ -980,9 +986,11 @@ void cryptonight_gpu_hash(nvid_ctx *ctx, const xmrig_cuda::Algorithm &algorithm,
cryptonight_core_gpu_hash<Algorithm::CN_LITE_1>(ctx, startNonce);
break;

# ifdef XMRIG_ALGO_CN_GPU
case Algorithm::CN_GPU:
cryptonight_core_gpu_hash_gpu<Algorithm::CN_GPU>(ctx, startNonce);
break;
# endif

default:
break;
Expand Down
32 changes: 16 additions & 16 deletions src/cuda_cryptonight_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ struct __m128 : public float4

__forceinline__ __device__ __m128( const __m128i& x0)
{
float4::x = int2float(x0.x);
float4::y = int2float(x0.y);
float4::z = int2float(x0.z);
float4::w = int2float(x0.w);
float4::x = __int2float_rn(x0.x);
float4::y = __int2float_rn(x0.y);
float4::z = __int2float_rn(x0.z);
float4::w = __int2float_rn(x0.w);
}

__forceinline__ __device__ __m128i get_int( )
Expand Down Expand Up @@ -217,30 +217,30 @@ __forceinline__ __device__ __m128 _mm_div_ps(__m128 a, __m128 b)
__forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b)
{
return __m128(
int_as_float(float_as_int(a.x) & b),
int_as_float(float_as_int(a.y) & b),
int_as_float(float_as_int(a.z) & b),
int_as_float(float_as_int(a.w) & b)
__int_as_float(__float_as_int(a.x) & b),
__int_as_float(__float_as_int(a.y) & b),
__int_as_float(__float_as_int(a.z) & b),
__int_as_float(__float_as_int(a.w) & b)
);
}

__forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b)
{
return __m128(
int_as_float(float_as_int(a.x) | b),
int_as_float(float_as_int(a.y) | b),
int_as_float(float_as_int(a.z) | b),
int_as_float(float_as_int(a.w) | b)
__int_as_float(__float_as_int(a.x) | b),
__int_as_float(__float_as_int(a.y) | b),
__int_as_float(__float_as_int(a.z) | b),
__int_as_float(__float_as_int(a.w) | b)
);
}

__forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b)
{
return __m128(
int_as_float(float_as_int(a.x) ^ b),
int_as_float(float_as_int(a.y) ^ b),
int_as_float(float_as_int(a.z) ^ b),
int_as_float(float_as_int(a.w) ^ b)
__int_as_float(__float_as_int(a.x) ^ b),
__int_as_float(__float_as_int(a.y) ^ b),
__int_as_float(__float_as_int(a.z) ^ b),
__int_as_float(__float_as_int(a.w) ^ b)
);
}

Expand Down
20 changes: 11 additions & 9 deletions src/cuda_extra.cu
Original file line number Diff line number Diff line change
Expand Up @@ -286,11 +286,10 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
}


#ifdef XMRIG_ALGO_CN_GPU
template<xmrig_cuda::Algorithm::Id ALGO>
__global__ void cryptonight_gpu_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 )
{
using namespace xmrig_cuda;

const int thread = blockDim.x * blockIdx.x + threadIdx.x;

__shared__ uint32_t sharedMemory[1024];
Expand Down Expand Up @@ -334,6 +333,7 @@ __global__ void cryptonight_gpu_extra_gpu_final( int threads, uint64_t target, u
d_res_nonce[idx] = thread;
}
}
#endif

void cryptonight_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len)
{
Expand Down Expand Up @@ -481,13 +481,13 @@ void cryptonight_extra_cpu_final(nvid_ctx *ctx, uint32_t startNonce, uint64_t ta

if (algorithm.family() == Algorithm::CN_HEAVY) {
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_final<Algorithm::CN_HEAVY_0><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ));
# ifdef XMRIG_ALGO_CN_GPU
} else if (algorithm == Algorithm::CN_GPU) {
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_gpu_extra_gpu_final<Algorithm::CN_GPU> << <grid, block >> > (wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
# endif
} else {
if (algorithm == Algorithm::CN_GPU) {
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_gpu_extra_gpu_final<Algorithm::CN_GPU> << <grid, block >> > (wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
} else {
// fallback for all other algorithms
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_final<Algorithm::CN_0> << <grid, block >> > (wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
}
// fallback for all other algorithms
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_final<Algorithm::CN_0> << <grid, block >> > (wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
}

CUDA_CHECK(ctx->device_id, cudaDeviceSynchronize());
Expand Down Expand Up @@ -620,7 +620,7 @@ int cuda_get_deviceinfo(nvid_ctx *ctx)
// Leave memory for 2080 MB dataset + 64 MB free
// Each thread uses 1 scratchpad plus a few small buffers on GPU
const size_t dataset_size = 2080u << 20;
const size_t max_blocks = (freeMemory - (ctx->rx_dataset_host ? 0 : dataset_size) - (64u << 20)) / (ctx->algorithm.l3() + 32768) / 32;
const int max_blocks = (freeMemory - (ctx->rx_dataset_host ? 0 : dataset_size) - (64u << 20)) / (ctx->algorithm.l3() + 32768) / 32;
if (ctx->device_blocks > max_blocks) {
ctx->device_blocks = max_blocks;
}
Expand Down Expand Up @@ -730,6 +730,7 @@ int cuda_get_deviceinfo(nvid_ctx *ctx)
}
}

# ifdef XMRIG_ALGO_CN_GPU
if (ctx->algorithm == Algorithm::CN_GPU && props.major < 7) {
int t = 32;
int b = ctx->device_blocks;
Expand All @@ -746,6 +747,7 @@ int cuda_get_deviceinfo(nvid_ctx *ctx)
ctx->device_blocks = b;
}
}
# endif

ctx->device_threads = std::min(ctx->device_threads, (props.major == 2 ? 64 : 128));
}
Expand Down
26 changes: 5 additions & 21 deletions src/xmrig-cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,14 @@ bool rxHash(nvid_ctx *ctx, uint32_t startNonce, uint64_t target, uint32_t *resco
RandomX_Keva::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
break;

case Algorithm::RX_XLA:
throw std::runtime_error(kUnsupportedAlgorithm);
//RandomX_DefyX::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);

case Algorithm::RX_GRAFT:
RandomX_Graft::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
break;

case Algorithm::RX_XLA:
//RandomX_Panthera::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
//break;

default:
throw std::runtime_error(kUnsupportedAlgorithm);
}
Expand All @@ -231,11 +231,7 @@ bool rxPrepare(nvid_ctx *ctx, const void *dataset, size_t datasetSize, bool, uin
resetError(ctx->device_id);

try {
# ifdef XMRIG_ALGO_RANDOMX
randomx_prepare(ctx, ctx->rx_dataset_host > 0 ? datasetHost.reg(dataset, datasetSize) : dataset, datasetSize, batchSize);
# else
throw std::runtime_error(kUnsupportedAlgorithm);
# endif
}
catch (std::exception &ex) {
return saveError(ctx->device_id, ex);
Expand Down Expand Up @@ -284,11 +280,7 @@ bool astroBWTPrepare(nvid_ctx *ctx, uint32_t batchSize)
resetError(ctx->device_id);

try {
# ifdef XMRIG_ALGO_ASTROBWT
astrobwt_prepare(ctx, batchSize);
# else
throw std::runtime_error(kUnsupportedAlgorithm);
# endif
}
catch (std::exception &ex) {
return saveError(ctx->device_id, ex);
Expand Down Expand Up @@ -337,11 +329,7 @@ bool kawPowPrepare_v2(nvid_ctx *ctx, const void* cache, size_t cache_size, const
resetError(ctx->device_id);

try {
# ifdef XMRIG_ALGO_KAWPOW
kawpow_prepare(ctx, cache, cache_size, dag_precalc, dag_size, height, dag_sizes);
# else
throw std::runtime_error(kUnsupportedAlgorithm);
# endif
}
catch (std::exception &ex) {
return saveError(ctx->device_id, ex);
Expand All @@ -360,11 +348,7 @@ bool kawPowStopHash(nvid_ctx *ctx)

# ifdef XMRIG_ALGO_KAWPOW
try {
# ifdef XMRIG_ALGO_KAWPOW
kawpow_stop_hash(ctx);
# else
throw std::runtime_error(kUnsupportedAlgorithm);
# endif
}
catch (std::exception &ex) {
return saveError(ctx->device_id, ex);
Expand Down Expand Up @@ -611,7 +595,7 @@ void release(nvid_ctx *ctx)
cudaFree(ctx->astrobwt_offsets_begin);
cudaFree(ctx->astrobwt_offsets_end);

# ifdef WITH_KAWPOW
# ifdef XMRIG_ALGO_KAWPOW
cudaFree(ctx->kawpow_cache);
cudaFree(ctx->kawpow_dag);
cudaFreeHost(ctx->kawpow_stop_host);
Expand Down