From 36defde86f4fa0d2b73a259ce1de7ad08cd1a83c Mon Sep 17 00:00:00 2001
From: chethega
Date: Sun, 23 Feb 2020 21:18:00 +0100
Subject: [PATCH 1/2] Introduce task-local and free-standing xoshiro.

Second version after rebase/rewrite of large parts.
---
 src/jltypes.c | 16 +-
 src/julia.h | 5 +
 src/task.c | 80 +++++++
 stdlib/Random/src/RNGs.jl | 144 +++++++++++++
 stdlib/Random/src/Random.jl | 3 +-
 stdlib/Random/src/XoshiroSimd.jl | 351 +++++++++++++++++++++++++++++++
 6 files changed, 594 insertions(+), 5 deletions(-)
 create mode 100644 stdlib/Random/src/XoshiroSimd.jl

diff --git a/src/jltypes.c b/src/jltypes.c
index 34210d15bf43f..47c017287ce46 100644
--- a/src/jltypes.c
+++ b/src/jltypes.c
@@ -2451,7 +2451,7 @@ void jl_init_types(void) JL_GC_DISABLED
 NULL,
 jl_any_type,
 jl_emptysvec,
- jl_perm_symsvec(11,
+ jl_perm_symsvec(15,
 "next",
 "queue",
 "storage",
@@ -2462,8 +2462,12 @@ void jl_init_types(void) JL_GC_DISABLED
 "backtrace",
 "logstate",
 "code",
- "sticky"),
- jl_svec(11,
+ "sticky",
+ "rngState0",
+ "rngState1",
+ "rngState2",
+ "rngState3"),
+ jl_svec(15,
 jl_any_type,
 jl_any_type,
 jl_any_type,
@@ -2474,7 +2478,11 @@ void jl_init_types(void) JL_GC_DISABLED
 jl_any_type,
 jl_any_type,
 jl_any_type,
- jl_bool_type),
+ jl_bool_type,
+ jl_uint64_type,
+ jl_uint64_type,
+ jl_uint64_type,
+ jl_uint64_type),
 0, 1, 9);
 jl_value_t *listt = jl_new_struct(jl_uniontype_type, jl_task_type, jl_nothing_type);
 jl_svecset(jl_task_type->types, 0, listt);
diff --git a/src/julia.h b/src/julia.h
index fe66a72b460ec..f331e4abd5d53 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -1799,6 +1799,11 @@ typedef struct _jl_task_t {
 jl_value_t *logstate;
 jl_function_t *start;
 uint8_t sticky; // record whether this Task can be migrated to a new thread
+ uint64_t rngState0; // really rngState[4], but more convenient to split
+ uint64_t rngState1;
+ uint64_t rngState2;
+ uint64_t rngState3;
+
 // hidden state:

 // id of owning thread - does not need to be defined until the task runs
diff --git a/src/task.c b/src/task.c
index 72debeefc2c7d..52453a4acc4c8 100644
--- a/src/task.c
+++ b/src/task.c
@@ -35,6 +35,7 @@
 #include "julia_internal.h"
 #include "threading.h"
 #include "julia_assert.h"
+#include "support/hashing.h"

 #ifdef __cplusplus
 extern "C" {
@@ -608,6 +609,83 @@ JL_DLLEXPORT void jl_rethrow_other(jl_value_t *e JL_MAYBE_UNROOTED)
 throw_internal(NULL);
 }

+/* This is xoshiro256++ 1.0, used for task-local random number generation in Julia.
+ This implementation is intended for embedders and internal use by the runtime, and is
+ based on the reference implementation at http://prng.di.unimi.it
+
+ Credits go to Sebastiano Vigna for coming up with this PRNG.
+
+ There is a pure Julia implementation in the stdlib that tends to be faster when used from
+ within Julia, due to inlining and more aggressive architecture-specific optimizations.
+*/
+JL_DLLEXPORT uint64_t jl_tasklocal_genrandom(jl_task_t *task) {
+ uint64_t s0 = task->rngState0;
+ uint64_t s1 = task->rngState1;
+ uint64_t s2 = task->rngState2;
+ uint64_t s3 = task->rngState3;
+
+ uint64_t t = s1 << 17;
+ uint64_t tmp = s0 + s3;
+ uint64_t res = ((tmp << 23) | (tmp >> 41)) + s0;
+ s2 ^= s0;
+ s3 ^= s1;
+ s1 ^= s2;
+ s0 ^= s3;
+ s2 ^= t;
+ s3 = (s3 << 45) | (s3 >> 19);
+
+ task->rngState0 = s0;
+ task->rngState1 = s1;
+ task->rngState2 = s2;
+ task->rngState3 = s3;
+ return res;
+}
+
+JL_DLLEXPORT void jl_tasklocal_seedrandom(jl_task_t *task,
+ uint64_t seed0,
+ uint64_t seed1,
+ uint64_t seed2,
+ uint64_t seed3) {
+ // Fixme: Consider a less ad-hoc construction
+ // We can afford burning a handful of cycles here, and we don't want any
+ // surprises with respect to bad seeds / bad interactions
+ uint64_t s = int64hash(seed0);
+ task->rngState0 = s;
+ s += int64hash(seed1);
+ task->rngState1 = s;
+ s += int64hash(seed2);
+ task->rngState2 = s;
+ s += int64hash(seed3);
+ task->rngState3 = s;
+ jl_tasklocal_genrandom(task);
+ jl_tasklocal_genrandom(task);
+ jl_tasklocal_genrandom(task);
+ jl_tasklocal_genrandom(task);
+}
+
+void rng_split(jl_task_t *from, jl_task_t *to) {
+ /* Fixme: consider a less ad-hoc construction
+ Ideally we could just use the output of the random stream to seed the initial
+ state of the child. Out of an overabundance of caution we multiply with
+ effectively random coefficients, to break possible self-interactions.
+
+ It is not the goal to mix bits -- we work under the assumption that the
+ source is well-seeded, and its output looks effectively random.
+ However, xoshiro has never been studied in the mode where we seed the
+ initial state with the output of another xoshiro instance.
+
+ Constants have nothing up their sleeve:
+ 0x02011ce34bce797f == hash(UInt(1))|0x01
+ 0x5a94851fb48a6e05 == hash(UInt(2))|0x01
+ 0x3688cf5d48899fa7 == hash(UInt(3))|0x01
+ 0x867b4bb4c42e5661 == hash(UInt(4))|0x01
+ */
+ to->rngState0 = 0x02011ce34bce797f * jl_tasklocal_genrandom(from);
+ to->rngState1 = 0x5a94851fb48a6e05 * jl_tasklocal_genrandom(from);
+ to->rngState2 = 0x3688cf5d48899fa7 * jl_tasklocal_genrandom(from);
+ to->rngState3 = 0x867b4bb4c42e5661 * jl_tasklocal_genrandom(from);
+}
+
 JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion_future, size_t ssize)
 {
 jl_ptls_t ptls = jl_get_ptls_states();
@@ -643,6 +721,8 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion
 t->backtrace = jl_nothing;
 // Inherit logger state from parent task
 t->logstate = ptls->current_task->logstate;
+ // Fork task-local random state from parent
+ rng_split(ptls->current_task, t);
 // there is no active exception handler available on this stack yet
 t->eh = NULL;
 t->sticky = 1;
diff --git a/stdlib/Random/src/RNGs.jl b/stdlib/Random/src/RNGs.jl
index 281acad533dad..1d5da84b79ade 100644
--- a/stdlib/Random/src/RNGs.jl
+++ b/stdlib/Random/src/RNGs.jl
@@ -172,6 +172,149 @@ function fillcache_zeros!(r::MersenneTwister)
 r
 end

+## Xoshiro RNG. Unless this becomes the new default, consider excising to an external library.
+# Much of the implementation is shared with TaskLocal.
+"""
+ Xoshiro
+
+Xoshiro256++ is a fast pseudorandom number generator originally developed by Sebastiano Vigna.
+The reference implementation is available at http://prng.di.unimi.it
+
+Apart from its high speed, Xoshiro has a small memory footprint, making it suitable for
+applications where many different random states need to be held for a long time.
+
+Julia's Xoshiro implementation has a bulk-generation mode; this seeds new virtual PRNGs from
+the parent, and uses SIMD to generate in parallel (i.e. the bulk stream consists of multiple
+interleaved xoshiro instances). The virtual PRNGs are discarded once the bulk request has been
+serviced (and should cause no heap allocations).
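+
+A minimal usage sketch (illustrative only; it assumes nothing beyond the constructor and the
+`seed!`/`rand` methods defined in this file; `Xoshiro` is not exported, so it is accessed via
+the `Random` module):
+
+    using Random
+    rng = Random.Xoshiro()          # freshly seeded from RandomDevice()
+    Random.seed!(rng, 0x2a)         # deterministic re-seed from an integer
+    rand(rng, UInt64)               # a single scalar draw
+    rand(rng, Float64, 10^4)        # large array requests take the SIMD bulk path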
+"""
+mutable struct Xoshiro <: AbstractRNG
+ s0::UInt64
+ s1::UInt64
+ s2::UInt64
+ s3::UInt64
+end
+
+Xoshiro(::Nothing) = Xoshiro()
+
+function Xoshiro(parent::AbstractRNG = RandomDevice())
+ # Constants have nothing up their sleeve, cf task.c
+ # 0x02011ce34bce797f == hash(UInt(1))|0x01
+ # 0x5a94851fb48a6e05 == hash(UInt(2))|0x01
+ # 0x3688cf5d48899fa7 == hash(UInt(3))|0x01
+ # 0x867b4bb4c42e5661 == hash(UInt(4))|0x01
+
+ Xoshiro(0x02011ce34bce797f * rand(parent, UInt64),
+ 0x5a94851fb48a6e05 * rand(parent, UInt64),
+ 0x3688cf5d48899fa7 * rand(parent, UInt64),
+ 0x867b4bb4c42e5661 * rand(parent, UInt64))
+end
+
+rng_native_52(::Xoshiro) = UInt64
+
+function seed!(rng::Xoshiro, s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64)
+ # cf. task.c
+ s = Base.hash_uint64(s0)
+ rng.s0 = s
+ s += Base.hash_uint64(s1)
+ rng.s1 = s
+ s += Base.hash_uint64(s2)
+ rng.s2 = s
+ s += Base.hash_uint64(s3)
+ rng.s3 = s
+end
+
+
+@inline function rand(rng::Xoshiro, ::Type{UInt64})
+ s0, s1, s2, s3 = rng.s0, rng.s1, rng.s2, rng.s3
+ tmp = s0 + s3
+ res = tmp << 23 | tmp >> 41
+ t = s1 << 17
+ s2 = xor(s2, s0)
+ s3 = xor(s3, s1)
+ s1 = xor(s1, s2)
+ s0 = xor(s0, s3)
+ s2 = xor(s2, t)
+ s3 = s3 << 45 | s3 >> 19
+ rng.s0, rng.s1, rng.s2, rng.s3 = s0, s1, s2, s3
+ res
+end
+
+
+## Task-local RNG
+"""
+ TaskLocal
+
+The TaskLocal RNG has state that is local to its task, not its thread. It is seeded upon task
+creation from the state of its parent task; task creation is therefore an event that advances
+the parent's RNG state.
+
+As an upside, the TaskLocal RNG is fast and permits reproducible multithreaded simulations
+(barring race conditions), independent of scheduler decisions. As long as the number of threads
+is not used to make decisions on task creation, simulation results are also independent of the
+number of available threads / CPUs. The random stream should not depend on hardware specifics,
+up to endianness and possibly word size.
+
+Using or seeding the RNG of any task other than the one returned by `current_task()` is
+undefined behavior: it may appear to work most of the time, but can also fail silently.
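+
+A small sketch of the intended use (illustrative; it assumes only the `TaskLocal` export and
+the `seed!`/`rand` methods defined below, and that task creation forks the state as described
+above):
+
+    using Random
+    Random.seed!(TaskLocal(), 1234)                # seed the *current* task's stream
+    x = rand(TaskLocal(), UInt64)                  # draw from the task-local state
+    t = Threads.@spawn rand(TaskLocal(), UInt64)   # the child gets a forked copy of the state
+    y = fetch(t)                                   # reproducible given the parent state at spawn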
+""" +struct TaskLocal <: AbstractRNG end +TaskLocal(::Nothing) = TaskLocal() +rng_native_52(::TaskLocal) = UInt64 + +function seed!(::TaskLocal, s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64) + task = current_task() + ccall(:jl_tasklocal_seedrandom, Nothing, (Ref{Task}, UInt64, UInt64, UInt64, UInt64), task, s0,s1,s2,s3) + TaskLocal() +end + + +@inline function rand(::TaskLocal, ::Type{UInt64}) + task = current_task() + s0, s1, s2, s3 = task.rngState0, task.rngState1, task.rngState2, task.rngState3 + tmp = s0 + s3 + res = tmp << 23 | tmp >> 41 + t = s1 << 17 + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = s3 << 45 | s3 >> 19 + task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3 + res +end + +#Shared implementation between Xoshiro and TaskLocal -- seeding +function seed!(rng::Union{TaskLocal, Xoshiro}, seed::Union{Int128, UInt128, BigInt}) + seed0 = seed % UInt64 + seed1 = (seed>>64) % UInt64 + seed!(rng, seed0, seed1, zero(UInt64), zero(UInt64)) +end +seed!(rng::Union{TaskLocal, Xoshiro}, seed::Integer) = seed!(rng, seed%UInt64, zero(UInt64), zero(UInt64), zero(UInt64)) +seed!(rng::Union{TaskLocal, Xoshiro}, ::Nothing) = seed!(rng) + +seed!(rng::Union{TaskLocal, Xoshiro}) = seed!(rng, + rand(RandomDevice(), UInt64), + rand(RandomDevice(), UInt64), + rand(RandomDevice(), UInt64), + rand(RandomDevice(), UInt64)) + +function seed!(rng::Union{TaskLocal, Xoshiro}, seed::AbstractVector{UInt64}) + seed0 = length(seed)>0 ? seed[1] : UInt64(0) + seed1 = length(seed)>1 ? seed[2] : UInt64(0) + seed2 = length(seed)>2 ? seed[3] : UInt64(0) + seed3 = length(seed)>3 ? seed[4] : UInt64(0) + seed!(rng, seed0, seed1, seed2, seed3) +end + +function seed!(rng::Union{TaskLocal, Xoshiro}, seed::AbstractVector{UInt32}) + #can only process an even length + len = min(length(seed), 8) & ~1 + seed!(rng, reinterpret(UInt64, view(seed, 1:len))) +end + +@inline function rand(rng::Union{TaskLocal, Xoshiro}, ::Type{UInt128}) + first = rand(rng, UInt64) + second = rand(rng,UInt64) + second + UInt128(first)<<64 +end +@inline rand(rng::Union{TaskLocal, Xoshiro}, ::SamplerType{UInt128}) = rand(rng, UInt128) +@inline rand(rng::Union{TaskLocal, Xoshiro}, ::Type{Int128}) = rand(rng, UInt128) % Int128 +@inline rand(rng::Union{TaskLocal, Xoshiro}, ::SamplerType{Int128}) = rand(rng, UInt128) % Int128 + +@inline rand(rng::Union{TaskLocal, Xoshiro}, ::Type{T}) where {T<:Union{Bool, UInt8, Int8, UInt16, Int16, UInt32, Int32, Int64}} = rand(rng, UInt64) % T +@inline rand(rng::Union{TaskLocal, Xoshiro}, ::SamplerType{T}) where {T<:Union{Bool, UInt8, Int8, UInt16, Int16, UInt32, Int32, Int64, UInt64}} = rand(rng, UInt64) % T + +Sampler(rng::Union{TaskLocal, Xoshiro}, ::Type{T}) where T<:Union{Float64, Float32} = SamplerType{T}() ### low level API @@ -310,6 +453,7 @@ end function __init__() resize!(empty!(THREAD_RNGs), Threads.nthreads()) # ensures that we didn't save a bad object + seed!(TaskLocal()) end diff --git a/stdlib/Random/src/Random.jl b/stdlib/Random/src/Random.jl index 5197ac1c34e7b..d1f165511f8cc 100644 --- a/stdlib/Random/src/Random.jl +++ b/stdlib/Random/src/Random.jl @@ -27,7 +27,7 @@ export rand!, randn!, shuffle, shuffle!, randperm, randperm!, randcycle, randcycle!, - AbstractRNG, MersenneTwister, RandomDevice + AbstractRNG, MersenneTwister, RandomDevice, TaskLocal ## general definitions @@ -296,6 +296,7 @@ include("RNGs.jl") include("generation.jl") include("normal.jl") include("misc.jl") +include("XoshiroSimd.jl") ## rand & rand! 
& seed! docstrings
diff --git a/stdlib/Random/src/XoshiroSimd.jl b/stdlib/Random/src/XoshiroSimd.jl
new file mode 100644
index 0000000000000..7509b70a80a0b
--- /dev/null
+++ b/stdlib/Random/src/XoshiroSimd.jl
@@ -0,0 +1,351 @@
+# This file is a part of Julia. License is MIT: https://julialang.org/license
+
+module XoshiroSimd
+# Getting the xoshiro RNG to reliably vectorize is somewhat of a hassle without Simd.jl.
+import ..Random: TaskLocal, rand, rand!, UnsafeView, Xoshiro, SamplerType
+
+
+
+#Vector-width. Influences random stream. We may want to tune this before merging.
+xoshiroWidth() = Val(4)
+#Simd threshold. Influences random stream. We may want to tune this before merging.
+simdThreshold(::Type{T}) where T = 2048
+simdThreshold(::Type{Bool}) = 4096
+
+@inline _rotl45(x::UInt64) = (x<<45)|(x>>19)
+@inline _shl17(x::UInt64) = x<<17
+@inline _rotl23(x::UInt64) = (x<<23)|(x>>41)
+@inline _plus(x::UInt64,y::UInt64) = x+y
+@inline _xor(x::UInt64,y::UInt64) = xor(x,y)
+@inline _and(x::UInt64, y::UInt64) = x & y
+@inline _or(x::UInt64, y::UInt64) = x | y
+@inline _lshr(x, y::Int32) = _lshr(x, y % Int64)
+@inline _lshr(x::UInt64, y::Int64) = Core.Intrinsics.llvmcall("""
+ %res = lshr i64 %0, %1
+ ret i64 %res
+ """,
+ UInt64,
+ Tuple{UInt64, Int64},
+ x, y)
+
+
+
+# Required operations. These could be written more concisely with `ntuple`, but the compiler
+# sometimes refuses to properly vectorize.
+for N in [1,2,4,8,16,32]
+ let
+ local code
+ code = """
+ %lshiftOp = shufflevector <1 x i64> <i64 45>, <1 x i64> undef, <$N x i32> zeroinitializer
+ %rshiftOp = shufflevector <1 x i64> <i64 19>, <1 x i64> undef, <$N x i32> zeroinitializer
+ %lshifted = shl <$N x i64> %0, %lshiftOp
+ %rshifted = lshr <$N x i64> %0, %rshiftOp
+ %res = or <$N x i64> %lshifted, %rshifted
+ ret <$N x i64> %res
+ """
+ @eval @inline _rotl45(x::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+ NTuple{$N, VecElement{UInt64}},
+ Tuple{NTuple{$N, VecElement{UInt64}}},
+ x)
+
+ code = """
+ %lshiftOp = shufflevector <1 x i64> <i64 17>, <1 x i64> undef, <$N x i32> zeroinitializer
+ %res = shl <$N x i64> %0, %lshiftOp
+ ret <$N x i64> %res
+ """
+ @eval @inline _shl17(x::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+ NTuple{$N, VecElement{UInt64}},
+ Tuple{NTuple{$N, VecElement{UInt64}}},
+ x)
+
+ code = """
+ %lshiftOp = shufflevector <1 x i64> <i64 23>, <1 x i64> undef, <$N x i32> zeroinitializer
+ %rshiftOp = shufflevector <1 x i64> <i64 41>, <1 x i64> undef, <$N x i32> zeroinitializer
+ %lshifted = shl <$N x i64> %0, %lshiftOp
+ %rshifted = lshr <$N x i64> %0, %rshiftOp
+ %res = or <$N x i64> %lshifted, %rshifted
+ ret <$N x i64> %res
+ """
+ @eval @inline _rotl23(x::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+ NTuple{$N, VecElement{UInt64}},
+ Tuple{NTuple{$N, VecElement{UInt64}}},
+ x)
+
+ code = """
+ %res = add <$N x i64> %1, %0
+ ret <$N x i64> %res
+ """
+ @eval @inline _plus(x::NTuple{$N, VecElement{UInt64}}, y::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+ NTuple{$N, VecElement{UInt64}},
+ Tuple{NTuple{$N, VecElement{UInt64}}, NTuple{$N, VecElement{UInt64}}},
+ x, y)
+
+ code = """
+ %res = xor <$N x i64> %1, %0
+ ret <$N x i64> %res
+ """
+ @eval @inline _xor(x::NTuple{$N, VecElement{UInt64}}, y::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+ NTuple{$N, VecElement{UInt64}},
+ Tuple{NTuple{$N, VecElement{UInt64}}, NTuple{$N, VecElement{UInt64}}},
+ x, y)
+
+ code = """
+ %res = and <$N x i64> %1, %0
+ ret <$N x i64> %res
+ """
+ @eval @inline _and(x::NTuple{$N, VecElement{UInt64}}, y::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+ NTuple{$N, VecElement{UInt64}},
+ Tuple{NTuple{$N, VecElement{UInt64}}, NTuple{$N, VecElement{UInt64}}},
+ x, y)
+
+ code = """
+ %res = or <$N x i64> %1, %0
+ ret <$N x i64> %res
+ """
+ @eval @inline _or(x::NTuple{$N, VecElement{UInt64}}, y::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+ NTuple{$N, VecElement{UInt64}},
+ Tuple{NTuple{$N, VecElement{UInt64}}, NTuple{$N, VecElement{UInt64}}},
+ x, y)
+
+ code = """
+ %tmp = insertelement <1 x i64> undef, i64 %1, i32 0
+ %shift = shufflevector <1 x i64> %tmp, <1 x i64> %tmp, <$N x i32> zeroinitializer
+ %res = lshr <$N x i64> %0, %shift
+ ret <$N x i64> %res
+ """
+ @eval @inline _lshr(x::NTuple{$N, VecElement{UInt64}}, y::Int64) = Core.Intrinsics.llvmcall($code,
+ NTuple{$N, VecElement{UInt64}},
+ Tuple{NTuple{$N, VecElement{UInt64}}, Int64},
+ x, y)
+
+ end
+end
+
+
+
+# The mask values are used as res = _or(_and(res, mskBits), oneBits)
+
+mskBits(::Type{T}) where T = 0xffffffffffffffff
+mskBits(::Type{T}, ::Val{N}) where {T,N} = ntuple(i-> VecElement(mskBits(T)), Val(N))
+
+oneBits(::Type{T}) where T = 0x0000000000000000
+oneBits(::Type{T}, ::Val{N}) where {T,N} = ntuple(i-> VecElement(oneBits(T)), Val(N))
+
+mskBits(::Type{Float64}) = Base.significand_mask(Float64)
+oneBits(::Type{Float64}) = Base.exponent_one(Float64)
+
+mskBits(::Type{Float32}) = Base.significand_mask(Float32) + UInt64(Base.significand_mask(Float32))<<32
+oneBits(::Type{Float32}) = Base.exponent_one(Float32) + UInt64(Base.exponent_one(Float32))<<32
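+
+# Illustration of the masking trick above (a sketch for explanation only; it is not used by
+# the kernels below): keeping only the significand bits of a random UInt64 and OR-ing in the
+# exponent of one yields a Float64 uniform in [1, 2); the rand! wrappers at the bottom of this
+# file then subtract 1.0 to map that onto [0, 1). Schematically:
+#
+#     r = rand(UInt64)
+#     f = reinterpret(Float64, (r & Base.significand_mask(Float64)) | Base.exponent_one(Float64))
+#     # 1.0 <= f < 2.0, so f - 1.0 is uniform in [0, 1)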
_xor(s2, s0) + s3 = _xor(s3, s1) + s1 = _xor(s1, s2) + s0 = _xor(s0, s3) + s2 = _xor(s2, t) + s3 = _rotl45(s3) + ref = Ref(res) + #fixme: This may make the random-stream dependent on system endianness + GC.@preserve ref ccall(:memcpy, Nothing, (Ptr{UInt8}, Ptr{Cvoid}, Csize_t), dst+i, pointer_from_objref(ref) , len-i) + end + if rng isa TaskLocal + task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3 + else + rng.s0, rng.s1, rng.s2, rng.s3 = s0, s1, s2, s3 + end + +end + +@noinline function taskLocalRngBulk_nosimd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}) + if rng isa TaskLocal + task = current_task() + s0, s1, s2, s3 = task.rngState0, task.rngState1, task.rngState2, task.rngState3 + else + rng::Xoshiro + s0, s1, s2, s3 = rng.s0, rng.s1, rng.s2, rng.s3 + end + + i = 0 + while i+8 <= len + res = _rotl23(_plus(s0,s3)) + shift = 0 + while i+8 <= len && shift < 8 + resLoc = _and(_lshr(res, shift), 0x0101010101010101) + unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), res) + i += 8 + shift += 1 + end + + t = _shl17(s1) + s2 = _xor(s2, s0) + s3 = _xor(s3, s1) + s1 = _xor(s1, s2) + s0 = _xor(s0, s3) + s2 = _xor(s2, t) + s3 = _rotl45(s3) + end + if(i < len) + #we may overgenerate some bytes here, if len mod 64 <= 56 and len mod 8 != 0 + res = _rotl23(_plus(s0,s3)) + res = _or(_and(res, mskAND), mskOR) + while i+8 <= len + resLoc = _and(res, 0x0101010101010101) + res = _lshr(res, 1) + unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), resLoc) + i += 8 + end + resLoc = _and(res, 0x0101010101010101) + ref = Ref(resLoc) + GC.@preserve ref ccall(:memcpy, Nothing, (Ptr{UInt8}, Ptr{Cvoid}, Csize_t), dst+i, pointer_from_objref(ref) , len-i) + t = _shl17(s1) + s2 = _xor(s2, s0) + s3 = _xor(s3, s1) + s1 = _xor(s1, s2) + s0 = _xor(s0, s3) + s2 = _xor(s2, t) + s3 = _rotl45(s3) + end + if rng isa TaskLocal + task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3 + else + rng.s0, rng.s1, rng.s2, rng.s3 = s0, s1, s2, s3 + end + +end + + + +@noinline function taskLocalRngBulk_simd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, ::Val{N}) where {T,N} + #we rely on specialization, llvm const-prop + instcombine to remove unneeded masking + mskOR = oneBits(T, Val(N)) + mskAND = mskBits(T, Val(N)) + + s0, s1, s2, s3 = forkRand(rng, Val(N)) + + i = 0 + while i + 8*N <= len + res = _rotl23(_plus(s0,s3)) + res = _or(_and(res, mskAND), mskOR) + t = _shl17(s1) + s2 = _xor(s2, s0) + s3 = _xor(s3, s1) + s1 = _xor(s1, s2) + s0 = _xor(s0, s3) + s2 = _xor(s2, t) + s3 = _rotl45(s3) + unsafe_store!(reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + i), res) + i += 8*N + end + return i +end +@noinline function taskLocalRngBulk_simd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}, ::Val{N}) where {N} + + s0, s1, s2, s3 = forkRand(rng, Val(N)) + msk = ntuple(i->VecElement(0x0101010101010101), Val(N)) + i = 0 + while i + 64*N <= len + res = _rotl23(_plus(s0,s3)) + t = _shl17(s1) + s2 = _xor(s2, s0) + s3 = _xor(s3, s1) + s1 = _xor(s1, s2) + s0 = _xor(s0, s3) + s2 = _xor(s2, t) + s3 = _rotl45(s3) + for k=0:7 + tmp = _lshr(res, k) + toWrite = _and(tmp, msk) + unsafe_store!(reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + i + k*N*8), toWrite) + end + i += 64*N + end + return i +end + + + +function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Float32}, UnsafeView{Float32}}, ::SamplerType{Float32} = SamplerType{Float32}()) + GC.@preserve dst taskLocalRngBulk(rng, convert(Ptr{UInt8}, pointer(dst)), 
length(dst)*4, Float32, xoshiroWidth()) + @inbounds @simd for i =1:length(dst) + dst[i] = dst[i] - 1.0f0 + end + dst +end + +function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Float64}, UnsafeView{Float64}}, ::SamplerType{Float64} = SamplerType{Float64}()) + GC.@preserve dst taskLocalRngBulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*8, Float64, xoshiroWidth()) + @inbounds @simd for i =1:length(dst) + dst[i] = dst[i] - 1.0e0 + end + dst +end + +function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{T}, UnsafeView{T}}, ::SamplerType{T} = SamplerType{T}()) where {T<:Base.BitInteger} + GC.@preserve dst taskLocalRngBulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*sizeof(T), UInt8, xoshiroWidth()) + dst +end + +function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Bool}, UnsafeView{Bool}}, ::SamplerType{Bool} = SamplerType{Bool}()) + GC.@preserve dst taskLocalRngBulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst), Bool, xoshiroWidth()) + dst +end + +end # module From 4d0207eff7f5e4c2d6976cb4c801f2430e1666ce Mon Sep 17 00:00:00 2001 From: chethega Date: Thu, 13 Aug 2020 02:25:57 +0200 Subject: [PATCH 2/2] increase vector width to 8, rename some internal functions --- stdlib/Random/src/XoshiroSimd.jl | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/stdlib/Random/src/XoshiroSimd.jl b/stdlib/Random/src/XoshiroSimd.jl index 7509b70a80a0b..dd0b6b2cf7ca7 100644 --- a/stdlib/Random/src/XoshiroSimd.jl +++ b/stdlib/Random/src/XoshiroSimd.jl @@ -7,7 +7,7 @@ import ..Random: TaskLocal, rand, rand!, UnsafeView, Xoshiro, SamplerType #Vector-width. Influences random stream. We may want to tune this before merging. -xoshiroWidth() = Val(4) +xoshiroWidth() = Val(8) #Simd threshold. Influences random stream. We may want to tune this before merging. 
simdThreshold(::Type{T}) where T = 2048 simdThreshold(::Type{Bool}) = 4096 @@ -154,19 +154,19 @@ function forkRand(rng::Union{TaskLocal, Xoshiro}, ::Val{N}) where N (s0, s1, s2, s3) end -@inline function taskLocalRngBulk(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, ::Val{N}) where {T<:Union{UInt8, Float32, Float64, Bool}, N} +@inline function xoshiro_bulk(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, ::Val{N}) where {T<:Union{UInt8, Float32, Float64, Bool}, N} if len >= simdThreshold(T) - written = taskLocalRngBulk_simd(rng, dst, len, T, Val(N)) + written = xoshiro_bulk_simd(rng, dst, len, T, Val(N)) len -= written dst += written end if len != 0 - taskLocalRngBulk_nosimd(rng, dst, len, T) + xoshiro_bulk_nosimd(rng, dst, len, T) end nothing end -@noinline function taskLocalRngBulk_nosimd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}) where {T} +@noinline function xoshiro_bulk_nosimd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}) where {T} #we rely on specialization, llvm const-prop + instcombine to remove unneeded masking mskOR = oneBits(T) mskAND = mskBits(T) @@ -211,10 +211,10 @@ end else rng.s0, rng.s1, rng.s2, rng.s3 = s0, s1, s2, s3 end - + nothing end -@noinline function taskLocalRngBulk_nosimd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}) +@noinline function xoshiro_bulk_nosimd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}) if rng isa TaskLocal task = current_task() s0, s1, s2, s3 = task.rngState0, task.rngState1, task.rngState2, task.rngState3 @@ -245,7 +245,6 @@ end if(i < len) #we may overgenerate some bytes here, if len mod 64 <= 56 and len mod 8 != 0 res = _rotl23(_plus(s0,s3)) - res = _or(_and(res, mskAND), mskOR) while i+8 <= len resLoc = _and(res, 0x0101010101010101) res = _lshr(res, 1) @@ -268,12 +267,12 @@ end else rng.s0, rng.s1, rng.s2, rng.s3 = s0, s1, s2, s3 end - + nothing end -@noinline function taskLocalRngBulk_simd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, ::Val{N}) where {T,N} +@noinline function xoshiro_bulk_simd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, ::Val{N}) where {T,N} #we rely on specialization, llvm const-prop + instcombine to remove unneeded masking mskOR = oneBits(T, Val(N)) mskAND = mskBits(T, Val(N)) @@ -296,7 +295,7 @@ end end return i end -@noinline function taskLocalRngBulk_simd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}, ::Val{N}) where {N} +@noinline function xoshiro_bulk_simd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}, ::Val{N}) where {N} s0, s1, s2, s3 = forkRand(rng, Val(N)) msk = ntuple(i->VecElement(0x0101010101010101), Val(N)) @@ -323,7 +322,7 @@ end function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Float32}, UnsafeView{Float32}}, ::SamplerType{Float32} = SamplerType{Float32}()) - GC.@preserve dst taskLocalRngBulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*4, Float32, xoshiroWidth()) + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*4, Float32, xoshiroWidth()) @inbounds @simd for i =1:length(dst) dst[i] = dst[i] - 1.0f0 end @@ -331,7 +330,7 @@ function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Float32}, Unsafe end function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Float64}, UnsafeView{Float64}}, ::SamplerType{Float64} = SamplerType{Float64}()) - GC.@preserve dst taskLocalRngBulk(rng, 
convert(Ptr{UInt8}, pointer(dst)), length(dst)*8, Float64, xoshiroWidth()) + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*8, Float64, xoshiroWidth()) @inbounds @simd for i =1:length(dst) dst[i] = dst[i] - 1.0e0 end @@ -339,12 +338,12 @@ function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Float64}, Unsafe end function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{T}, UnsafeView{T}}, ::SamplerType{T} = SamplerType{T}()) where {T<:Base.BitInteger} - GC.@preserve dst taskLocalRngBulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*sizeof(T), UInt8, xoshiroWidth()) + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*sizeof(T), UInt8, xoshiroWidth()) dst end function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Bool}, UnsafeView{Bool}}, ::SamplerType{Bool} = SamplerType{Bool}()) - GC.@preserve dst taskLocalRngBulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst), Bool, xoshiroWidth()) + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst), Bool, xoshiroWidth()) dst end
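
Example usage of the new API (an illustrative sketch, not part of the patches above; it
assumes the `TaskLocal` export and the `seed!`, `rand` and `rand!` methods added in RNGs.jl
and XoshiroSimd.jl behave as described in their docstrings, including the array-`rand`
routing through the `Sampler` hook):

    using Random, Test

    # Task-local stream: re-seeding the current task reproduces the same draws.
    Random.seed!(TaskLocal(), 1234)
    a = rand(TaskLocal(), UInt64)
    Random.seed!(TaskLocal(), 1234)
    b = rand(TaskLocal(), UInt64)
    @test a == b

    # Free-standing Xoshiro: array requests above simdThreshold take the SIMD bulk path.
    rng = Random.Xoshiro()
    v = rand(rng, Float64, 4096)
    @test all(x -> 0.0 <= x < 1.0, v)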