Use MCG59 engine on GPU device #1423

Merged: 5 commits, Jun 14, 2023
Changes from all commits
4 changes: 0 additions & 4 deletions .github/workflows/conda-package.yml
@@ -196,8 +196,6 @@ jobs:
run: |
python -m pytest -q -ra --disable-warnings -vv ${{ env.TEST_SCOPE }}
working-directory: ${{ env.tests-path }}
env:
SYCL_QUEUE_THREAD_POOL_SIZE: 6

test_windows:
name: Test ['windows-latest', python='${{ matrix.python }}']
@@ -335,8 +333,6 @@ jobs:
run: |
python -m pytest -q -ra --disable-warnings -vv ${{ env.TEST_SCOPE }}
working-directory: ${{ env.tests-path }}
env:
SYCL_QUEUE_THREAD_POOL_SIZE: 6

upload:
name: Upload ['${{ matrix.os }}', python='${{ matrix.python }}']
1 change: 0 additions & 1 deletion .github/workflows/generate_coverage.yaml
@@ -66,7 +66,6 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COVERALLS_PARALLEL: true
SYCL_QUEUE_THREAD_POOL_SIZE: 6

coveralls:
name: Indicate completion to coveralls.io
6 changes: 3 additions & 3 deletions dpnp/__init__.py
@@ -27,6 +27,9 @@
import os
mypath = os.path.dirname(os.path.realpath(__file__))

# workaround against hanging in OneMKL calls and in DPCTL
os.environ.setdefault('SYCL_QUEUE_THREAD_POOL_SIZE', '6')

import dpctl
dpctlpath = os.path.dirname(dpctl.__file__)

@@ -41,9 +44,6 @@
os.add_dll_directory(dpctlpath)
os.environ["PATH"] = os.pathsep.join([os.getenv("PATH", ""), mypath, dpctlpath])

# workaround against hanging in OneMKL calls
os.environ.setdefault('SYCL_QUEUE_THREAD_POOL_SIZE', '6')

from dpnp.dpnp_array import dpnp_array as ndarray
from dpnp.dpnp_flatiter import flatiter as flatiter

235 changes: 158 additions & 77 deletions dpnp/backend/kernels/dpnp_krnl_random.cpp
@@ -77,24 +77,78 @@ void dpnp_rng_srand_c(size_t seed)
}

template <typename _DistrType, typename _EngineType, typename _DataType>
static inline DPCTLSyclEventRef dpnp_rng_generate(const _DistrType& distr,
_EngineType& engine,
const int64_t size,
_DataType* result) {
static inline DPCTLSyclEventRef
dpnp_rng_generate(const _DistrType& distr, _EngineType& engine, const int64_t size, _DataType* result)
{
DPCTLSyclEventRef event_ref = nullptr;
sycl::event event;

// perform rng generation
try {
try
{
event = mkl_rng::generate<_DistrType, _EngineType>(distr, engine, size, result);
event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event);
} catch (const std::exception &e) {
}
catch (const std::exception& e)
{
// TODO: add error reporting
return event_ref;
}
return DPCTLEvent_Copy(event_ref);
}

template <typename _EngineType, typename _DataType>
static inline DPCTLSyclEventRef dpnp_rng_generate_uniform(
_EngineType& engine, sycl::queue* q, const _DataType a, const _DataType b, const int64_t size, _DataType* result)
{
DPCTLSyclEventRef event_ref = nullptr;

if constexpr (std::is_same<_DataType, int32_t>::value)
{
if (q->get_device().has(sycl::aspect::fp64))
{
/**
* A note from oneMKL for oneapi::mkl::rng::uniform (Discrete):
* The oneapi::mkl::rng::uniform_method::standard uses the s BRNG type on GPU devices.
* This might cause the produced numbers to have incorrect statistics (due to rounding error)
* when abs(b-a) > 2^23 || abs(b) > 2^23 || abs(a) > 2^23. To get proper statistics for this case,
* use the oneapi::mkl::rng::uniform_method::accurate method instead.
*/
using method_type = mkl_rng::uniform_method::accurate;
mkl_rng::uniform<_DataType, method_type> distribution(a, b);

// perform generation
try
{
sycl::event event = mkl_rng::generate(distribution, engine, size, result);

event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event);
return DPCTLEvent_Copy(event_ref);
}
catch (const oneapi::mkl::unsupported_device&)
{
// fall through to try with uniform_method::standard
}
catch (const oneapi::mkl::unimplemented&)
{
// fall through to try with uniform_method::standard
}
catch (const std::exception& e)
{
// TODO: add error reporting
return event_ref;
}
}
}

// uniform_method::standard is a method used by default
using method_type = mkl_rng::uniform_method::standard;
mkl_rng::uniform<_DataType, method_type> distribution(a, b);

// perform generation
return dpnp_rng_generate(distribution, engine, size, result);
}
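
The oneMKL note quoted inside dpnp_rng_generate_uniform above comes down to single-precision representability: a 32-bit float carries a 24-bit significand, so consecutive integers stop being distinguishable as values approach 2^24, and a single-precision ("s") generator may skew the statistics over integer ranges that wide — hence the preference for the accurate method beyond the 2^23 bounds quoted in the note. A minimal standalone illustration of that collapse (plain C++, not dpnp code):

// Minimal illustration (not dpnp code) of the single-precision limit behind
// the oneMKL note above: 32-bit floats have a 24-bit significand, so
// consecutive integers collide once values approach 2^24.
#include <iostream>

int main()
{
    const float below = 8388608.0f;   // 2^23, exactly representable
    const float at = 16777216.0f;     // 2^24, exactly representable
    const float next = 16777217.0f;   // 2^24 + 1 rounds back down to 2^24
    std::cout << std::boolalpha
              << (at == next) << '\n'              // true: the two integers collide
              << (below + 1.0f != below) << '\n';  // true: still exact below 2^23
    return 0;
}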

template <typename _DataType>
DPCTLSyclEventRef dpnp_rng_beta_c(DPCTLSyclQueueRef q_ref,
void* result,
@@ -1392,49 +1446,75 @@ DPCTLSyclEventRef dpnp_rng_normal_c(DPCTLSyclQueueRef q_ref,
{
// avoid warning unused variable
(void)dep_event_vec_ref;
(void)q_ref;

DPCTLSyclEventRef event_ref = nullptr;
sycl::queue* q = reinterpret_cast<sycl::queue*>(q_ref);

if (!size)
{
return event_ref;
}
assert(q != nullptr);

mt19937_struct* random_state = static_cast<mt19937_struct *>(random_state_in);
_DataType* result = static_cast<_DataType *>(result_out);
_DataType* result = static_cast<_DataType*>(result_out);

// set mean of distribution
const _DataType mean = static_cast<_DataType>(mean_in);
// set standard deviation of distribution
const _DataType stddev = static_cast<_DataType>(stddev_in);

mkl_rng::gaussian<_DataType> distribution(mean, stddev);
mkl_rng::mt19937 *engine = static_cast<mkl_rng::mt19937 *>(random_state->engine);

// perform generation
return dpnp_rng_generate<mkl_rng::gaussian<_DataType>, mkl_rng::mt19937, _DataType>(
distribution, *engine, size, result);
if (q->get_device().is_cpu())
{
mt19937_struct* random_state = static_cast<mt19937_struct*>(random_state_in);
mkl_rng::mt19937* engine = static_cast<mkl_rng::mt19937*>(random_state->engine);

// perform generation with MT19937 engine
event_ref = dpnp_rng_generate(distribution, *engine, size, result);
}
else
{
mcg59_struct* random_state = static_cast<mcg59_struct*>(random_state_in);
mkl_rng::mcg59* engine = static_cast<mkl_rng::mcg59*>(random_state->engine);

// perform generation with MCG59 engine
event_ref = dpnp_rng_generate(distribution, *engine, size, result);
}
return event_ref;
}

template <typename _DataType>
void dpnp_rng_normal_c(void* result, const _DataType mean, const _DataType stddev, const size_t size)
{
DPCTLSyclQueueRef q_ref = reinterpret_cast<DPCTLSyclQueueRef>(&DPNP_QUEUE);
sycl::queue* q = &DPNP_QUEUE;
DPCTLSyclQueueRef q_ref = reinterpret_cast<DPCTLSyclQueueRef>(q);
DPCTLEventVectorRef dep_event_vec_ref = nullptr;
mt19937_struct* mt19937 = new mt19937_struct();
mt19937->engine = &DPNP_RNG_ENGINE;
DPCTLSyclEventRef event_ref = nullptr;

DPCTLSyclEventRef event_ref = dpnp_rng_normal_c<_DataType>(q_ref,
result,
mean,
stddev,
static_cast<int64_t>(size),
mt19937,
dep_event_vec_ref);
DPCTLEvent_WaitAndThrow(event_ref);
DPCTLEvent_Delete(event_ref);
delete mt19937;
if (q->get_device().is_cpu())
{
mt19937_struct* mt19937 = new mt19937_struct();
mt19937->engine = &DPNP_RNG_ENGINE;

event_ref = dpnp_rng_normal_c<_DataType>(
q_ref, result, mean, stddev, static_cast<int64_t>(size), mt19937, dep_event_vec_ref);
DPCTLEvent_WaitAndThrow(event_ref);
DPCTLEvent_Delete(event_ref);
delete mt19937;
}
else
{
// MCG59 engine is assumed to provide better performance on GPU than MT19937
mcg59_struct* mcg59 = new mcg59_struct();
mcg59->engine = &DPNP_RNG_MCG59_ENGINE;

event_ref = dpnp_rng_normal_c<_DataType>(
q_ref, result, mean, stddev, static_cast<int64_t>(size), mcg59, dep_event_vec_ref);
DPCTLEvent_WaitAndThrow(event_ref);
DPCTLEvent_Delete(event_ref);
delete mcg59;
}
}
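
This wrapper captures the core change of the PR: keep MT19937 on CPU queues and switch to MCG59 on other (GPU) devices. Below is a condensed, self-contained sketch of that selection, assuming only a oneMKL + SYCL toolchain; the engine and distribution names come from oneMKL, while everything else is illustrative rather than dpnp's own code.

// Self-contained sketch (assumes a oneMKL + SYCL toolchain) of the engine
// selection used throughout this PR: MT19937 on CPU devices, MCG59 elsewhere.
#include <sycl/sycl.hpp>
#include <oneapi/mkl/rng.hpp>
#include <cstdint>

namespace mkl_rng = oneapi::mkl::rng;

int main()
{
    sycl::queue q;                                      // default-selected device
    const std::int64_t size = 1024;
    double* result = sycl::malloc_device<double>(size, q);

    mkl_rng::gaussian<double> distribution(0.0, 1.0);   // mean = 0, stddev = 1

    if (q.get_device().is_cpu())
    {
        mkl_rng::mt19937 engine(q, 1);                  // keep the historical CPU engine
        mkl_rng::generate(distribution, engine, size, result).wait();
    }
    else
    {
        mkl_rng::mcg59 engine(q, 1);                    // assumed faster on GPU
        mkl_rng::generate(distribution, engine, size, result).wait();
    }

    sycl::free(result, q);
    return 0;
}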

template <typename _DataType>
@@ -2149,74 +2229,75 @@ DPCTLSyclEventRef dpnp_rng_uniform_c(DPCTLSyclQueueRef q_ref,
return event_ref;
}

sycl::queue *q = reinterpret_cast<sycl::queue *>(q_ref);
sycl::queue* q = reinterpret_cast<sycl::queue*>(q_ref);

mt19937_struct* random_state = static_cast<mt19937_struct *>(random_state_in);
_DataType* result = static_cast<_DataType *>(result_out);
_DataType* result = static_cast<_DataType*>(result_out);

// set left bound of distribution
const _DataType a = static_cast<_DataType>(low);
// set right bound of distribution
const _DataType b = static_cast<_DataType>(high);

mkl_rng::mt19937 *engine = static_cast<mkl_rng::mt19937 *>(random_state->engine);

if constexpr (std::is_same<_DataType, int32_t>::value) {
if (q->get_device().has(sycl::aspect::fp64)) {
/**
* A note from oneMKL for oneapi::mkl::rng::uniform (Discrete):
* The oneapi::mkl::rng::uniform_method::standard uses the s BRNG type on GPU devices.
* This might cause the produced numbers to have incorrect statistics (due to rounding error)
* when abs(b-a) > 2^23 || abs(b) > 2^23 || abs(a) > 2^23. To get proper statistics for this case,
* use the oneapi::mkl::rng::uniform_method::accurate method instead.
*/
using method_type = mkl_rng::uniform_method::accurate;
mkl_rng::uniform<_DataType, method_type> distribution(a, b);
if (q->get_device().is_cpu())
{
mt19937_struct* random_state = static_cast<mt19937_struct*>(random_state_in);
mkl_rng::mt19937* engine = static_cast<mkl_rng::mt19937*>(random_state->engine);

// perform generation
try {
auto event = mkl_rng::generate<mkl_rng::uniform<_DataType, method_type>, mkl_rng::mt19937>(
distribution, *engine, size, result);
event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event);
return DPCTLEvent_Copy(event_ref);
} catch (const oneapi::mkl::unsupported_device&) {
// fall through to try with uniform_method::standard
} catch (const oneapi::mkl::unimplemented&) {
// fall through to try with uniform_method::standard
} catch (const std::exception &e) {
// TODO: add error reporting
return event_ref;
}
}
// perform generation with MT19937 engine
event_ref = dpnp_rng_generate_uniform(*engine, q, a, b, size, result);
}
else
{
mcg59_struct* random_state = static_cast<mcg59_struct*>(random_state_in);
mkl_rng::mcg59* engine = static_cast<mkl_rng::mcg59*>(random_state->engine);

// uniform_method::standard is a method used by default
using method_type = mkl_rng::uniform_method::standard;
mkl_rng::uniform<_DataType, method_type> distribution(a, b);

// perform generation
return dpnp_rng_generate<mkl_rng::uniform<_DataType, method_type>, mkl_rng::mt19937, _DataType>(
distribution, *engine, size, result);
// perform generation with MCG59 engine
event_ref = dpnp_rng_generate_uniform(*engine, q, a, b, size, result);
}
return event_ref;
}

template <typename _DataType>
void dpnp_rng_uniform_c(void* result, const long low, const long high, const size_t size)
{
DPCTLSyclQueueRef q_ref = reinterpret_cast<DPCTLSyclQueueRef>(&DPNP_QUEUE);
sycl::queue* q = &DPNP_QUEUE;
DPCTLSyclQueueRef q_ref = reinterpret_cast<DPCTLSyclQueueRef>(q);
DPCTLEventVectorRef dep_event_vec_ref = nullptr;
mt19937_struct* mt19937 = new mt19937_struct();
mt19937->engine = &DPNP_RNG_ENGINE;
DPCTLSyclEventRef event_ref = nullptr;

DPCTLSyclEventRef event_ref = dpnp_rng_uniform_c<_DataType>(q_ref,
result,
static_cast<double>(low),
static_cast<double>(high),
static_cast<int64_t>(size),
mt19937,
dep_event_vec_ref);
DPCTLEvent_WaitAndThrow(event_ref);
DPCTLEvent_Delete(event_ref);
delete mt19937;
if (q->get_device().is_cpu())
{
mt19937_struct* mt19937 = new mt19937_struct();
mt19937->engine = &DPNP_RNG_ENGINE;

event_ref = dpnp_rng_uniform_c<_DataType>(q_ref,
result,
static_cast<double>(low),
static_cast<double>(high),
static_cast<int64_t>(size),
mt19937,
dep_event_vec_ref);
DPCTLEvent_WaitAndThrow(event_ref);
DPCTLEvent_Delete(event_ref);
delete mt19937;
}
else
{
// MCG59 engine is assumed to provide better performance on GPU than MT19937
mcg59_struct* mcg59 = new mcg59_struct();
mcg59->engine = &DPNP_RNG_MCG59_ENGINE;

event_ref = dpnp_rng_uniform_c<_DataType>(q_ref,
result,
static_cast<double>(low),
static_cast<double>(high),
static_cast<int64_t>(size),
mcg59,
dep_event_vec_ref);
DPCTLEvent_WaitAndThrow(event_ref);
DPCTLEvent_Delete(event_ref);
delete mcg59;
}
}

template <typename _DataType>
15 changes: 14 additions & 1 deletion dpnp/backend/src/dpnp_random_state.cpp
@@ -1,5 +1,5 @@
//*****************************************************************************
// Copyright (c) 2022, Intel Corporation
// Copyright (c) 2022-2023, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
@@ -52,3 +52,16 @@ void MT19937_Delete(mt19937_struct *mt19937) {
mt19937->engine = nullptr;
delete engine;
}

void MCG59_InitScalarSeed(mcg59_struct* mcg59, DPCTLSyclQueueRef q_ref, uint64_t seed)
{
sycl::queue* q = reinterpret_cast<sycl::queue*>(q_ref);
mcg59->engine = new mkl_rng::mcg59(*q, seed);
}

void MCG59_Delete(mcg59_struct* mcg59)
{
mkl_rng::mcg59* engine = static_cast<mkl_rng::mcg59*>(mcg59->engine);
mcg59->engine = nullptr;
delete engine;
}
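
A hypothetical usage sketch of the two new helpers, mirroring how the MT19937 counterparts are used elsewhere in the backend; the header name and the mcg59_struct layout are assumptions based on this file, and the queue cast follows the reinterpret_cast convention the backend already uses for DPCTLSyclQueueRef.

// Hypothetical usage sketch of the new MCG59 state helpers. The header name,
// the mcg59_struct layout, and the availability of DPCTLSyclQueueRef through it
// are assumptions modelled on the MT19937 counterparts in this file.
#include <sycl/sycl.hpp>
#include "dpnp_random_state.hpp"

int main()
{
    sycl::queue q;  // queue targeting the device the engine should live on
    DPCTLSyclQueueRef q_ref = reinterpret_cast<DPCTLSyclQueueRef>(&q);

    mcg59_struct mcg59{};
    MCG59_InitScalarSeed(&mcg59, q_ref, 12345u);  // bind an MCG59 engine to the queue
    // ... pass &mcg59 as random_state_in to the dpnp_rng_*_c kernels ...
    MCG59_Delete(&mcg59);                         // frees the engine, resets the pointer
    return 0;
}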