From bce6397a15a4ce4a40269d498ef1afe14e660710 Mon Sep 17 00:00:00 2001 From: chethega Date: Sun, 23 Feb 2020 21:18:00 +0100 Subject: [PATCH] Introduce task-local and free-standing xoshiro RNG --- src/jltypes.c | 16 +- src/julia.h | 4 + src/task.c | 60 ++++++ stdlib/Random/src/RNGs.jl | 211 +++++++++++++++++++ stdlib/Random/src/Random.jl | 3 +- stdlib/Random/src/XoshiroSimd.jl | 344 +++++++++++++++++++++++++++++++ test/threads_exec.jl | 25 +++ 7 files changed, 658 insertions(+), 5 deletions(-) create mode 100644 stdlib/Random/src/XoshiroSimd.jl diff --git a/src/jltypes.c b/src/jltypes.c index 2ba69caf1991ed..00598504d5062e 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -2462,7 +2462,7 @@ void jl_init_types(void) JL_GC_DISABLED NULL, jl_any_type, jl_emptysvec, - jl_perm_symsvec(10, + jl_perm_symsvec(14, "next", "queue", "storage", @@ -2472,8 +2472,12 @@ void jl_init_types(void) JL_GC_DISABLED "code", "_state", "sticky", - "_isexception"), - jl_svec(10, + "_isexception", + "rngState0", + "rngState1", + "rngState2", + "rngState3"), + jl_svec(14, jl_any_type, jl_any_type, jl_any_type, @@ -2483,7 +2487,11 @@ void jl_init_types(void) JL_GC_DISABLED jl_any_type, jl_uint8_type, jl_bool_type, - jl_bool_type), + jl_bool_type, + jl_uint64_type, + jl_uint64_type, + jl_uint64_type, + jl_uint64_type), 0, 1, 6); jl_value_t *listt = jl_new_struct(jl_uniontype_type, jl_task_type, jl_nothing_type); jl_svecset(jl_task_type->types, 0, listt); diff --git a/src/julia.h b/src/julia.h index 1e0d96af669c73..a39949515a306c 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1809,6 +1809,10 @@ typedef struct _jl_task_t { uint8_t _state; uint8_t sticky; // record whether this Task can be migrated to a new thread uint8_t _isexception; // set if `result` is an exception to throw or that we exited with + uint64_t rngState0; // really rngState[4], but more convenient to split + uint64_t rngState1; + uint64_t rngState2; + uint64_t rngState3; // hidden state: // id of owning thread - does not need to be defined until the task runs diff --git a/src/task.c b/src/task.c index 4d38d30f5cc56e..da2662b7457efa 100644 --- a/src/task.c +++ b/src/task.c @@ -35,6 +35,7 @@ #include "julia_internal.h" #include "threading.h" #include "julia_assert.h" +#include "support/hashing.h" #ifdef __cplusplus extern "C" { @@ -666,6 +667,63 @@ JL_DLLEXPORT void jl_rethrow_other(jl_value_t *e JL_MAYBE_UNROOTED) throw_internal(NULL); } +/* This is xoshiro256++ 1.0, used for tasklocal random number generation in julia. + This implementation is intended for embedders and internal use by the runtime, and is + based on the reference implementation on http://prng.di.unimi.it + + Credits go to Sebastiano Vigna for coming up with this PRNG. + + There is a pure julia implementation in stdlib that tends to be faster when used from + within julia, due to inlining and more agressive architecture-specific optimizations. 
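+
+   A minimal calling sketch for embedders (illustrative only; it relies just on
+   `jl_get_ptls_states` and the task fields added in this patch):
+
+       jl_ptls_t ptls = jl_get_ptls_states();
+       uint64_t r = jl_tasklocal_genrandom(ptls->current_task);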
+*/
+JL_DLLEXPORT uint64_t jl_tasklocal_genrandom(jl_task_t *task) JL_NOTSAFEPOINT
+{
+    uint64_t s0 = task->rngState0;
+    uint64_t s1 = task->rngState1;
+    uint64_t s2 = task->rngState2;
+    uint64_t s3 = task->rngState3;
+
+    uint64_t t = s1 << 17;
+    uint64_t tmp = s0 + s3;
+    uint64_t res = ((tmp << 23) | (tmp >> 41)) + s0;
+    s2 ^= s0;
+    s3 ^= s1;
+    s1 ^= s2;
+    s0 ^= s3;
+    s2 ^= t;
+    s3 = (s3 << 45) | (s3 >> 19);
+
+    task->rngState0 = s0;
+    task->rngState1 = s1;
+    task->rngState2 = s2;
+    task->rngState3 = s3;
+    return res;
+}
+
+void rng_split(jl_task_t *from, jl_task_t *to) JL_NOTSAFEPOINT
+{
+    /* TODO: consider a less ad-hoc construction
+       Ideally we could just use the output of the random stream to seed the initial
+       state of the child. Out of an overabundance of caution we multiply with
+       effectively random coefficients, to break possible self-interactions.
+
+       It is not the goal to mix bits -- we work under the assumption that the
+       source is well-seeded, and its output looks effectively random.
+       However, xoshiro has never been studied in the mode where we seed the
+       initial state with the output of another xoshiro instance.
+
+       Constants have nothing up their sleeve:
+       0x02011ce34bce797f == hash(UInt(1))|0x01
+       0x5a94851fb48a6e05 == hash(UInt(2))|0x01
+       0x3688cf5d48899fa7 == hash(UInt(3))|0x01
+       0x867b4bb4c42e5661 == hash(UInt(4))|0x01
+    */
+    to->rngState0 = 0x02011ce34bce797f * jl_tasklocal_genrandom(from);
+    to->rngState1 = 0x5a94851fb48a6e05 * jl_tasklocal_genrandom(from);
+    to->rngState2 = 0x3688cf5d48899fa7 * jl_tasklocal_genrandom(from);
+    to->rngState3 = 0x867b4bb4c42e5661 * jl_tasklocal_genrandom(from);
+}
+
 JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion_future, size_t ssize)
 {
     jl_ptls_t ptls = jl_get_ptls_states();
@@ -701,6 +759,8 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion
     t->_isexception = 0;
     // Inherit logger state from parent task
     t->logstate = ptls->current_task->logstate;
+    // Fork task-local random state from parent
+    rng_split(ptls->current_task, t);
     // there is no active exception handler available on this stack yet
     t->eh = NULL;
     t->sticky = 1;
diff --git a/stdlib/Random/src/RNGs.jl b/stdlib/Random/src/RNGs.jl
index 5c29954f131321..6aede595e0415d 100644
--- a/stdlib/Random/src/RNGs.jl
+++ b/stdlib/Random/src/RNGs.jl
@@ -216,6 +216,216 @@ function reset_caches!(r::MersenneTwister)
     r
 end
+
+## Xoshiro RNG
+# Lots of implementation is shared with TaskLocal
+
+"""
+    Xoshiro
+
+Xoshiro256++ is a fast pseudorandom number generator originally developed by Sebastiano Vigna.
+The reference implementation is available at http://prng.di.unimi.it
+
+Apart from the high speed, Xoshiro has a small memory footprint, making it suitable for
+applications where many different random states need to be held for a long time.
+
+Julia's Xoshiro implementation has a bulk-generation mode; this seeds new virtual PRNGs
+from the parent, and uses SIMD to generate in parallel (i.e. the bulk stream consists of
+multiple interleaved xoshiro instances).
+The virtual PRNGs are discarded once the bulk request has been serviced (and should cause
+no heap allocations).
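+
+# Examples
+
+A minimal usage sketch (illustrative only; it relies on the `seed!` and `rand` methods added
+in this patch, and output values are omitted since they depend on the seed source):
+
+```julia
+using Random
+
+rng = Xoshiro()              # freshly seeded from RandomDevice()
+Random.seed!(rng, 1234)      # deterministic reseed
+x = rand(rng, UInt64)        # scalar draw
+v = rand(rng, UInt64, 10^4)  # large request; should be serviced by the SIMD bulk path
+```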
+""" +mutable struct Xoshiro <: AbstractRNG + s0::UInt64 + s1::UInt64 + s2::UInt64 + s3::UInt64 +end + +Xoshiro(::Nothing) = Xoshiro() + +function Xoshiro(parent::AbstractRNG = RandomDevice()) + # Constants have nothing up their sleeve, see task.c + # 0x02011ce34bce797f == hash(UInt(1))|0x01 + # 0x5a94851fb48a6e05 == hash(UInt(2))|0x01 + # 0x3688cf5d48899fa7 == hash(UInt(3))|0x01 + # 0x867b4bb4c42e5661 == hash(UInt(4))|0x01 + + Xoshiro(0x02011ce34bce797f * rand(parent, UInt64), + 0x5a94851fb48a6e05 * rand(parent, UInt64), + 0x3688cf5d48899fa7 * rand(parent, UInt64), + 0x867b4bb4c42e5661 * rand(parent, UInt64)) +end + +copy(rng::Xoshiro) = Xoshiro(rng.s0, rng.s1, rng.s2, rng.s3) + +function copy!(dst::Xoshiro, src::Xoshiro) + dst.s0, dst.s1, dst.s2, dst.s3 = src.s0, src.s1, src.s2, src.s3 + dst +end + +function ==(a::Xoshiro, b::Xoshiro) + a.s0 == b.s0 && a.s1 == b.s1 && a.s2 == b.s2 && a.s3 == b.s3 +end + +rng_native_52(::Xoshiro) = UInt64 + +function seed!(rng::Xoshiro, s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64) + # see task.c + s = Base.hash_uint64(s0) + rng.s0 = s + s += Base.hash_uint64(s1) + rng.s1 = s + s += Base.hash_uint64(s2) + rng.s2 = s + s += Base.hash_uint64(s3) + rng.s3 = s + rng +end + +@inline function rand(rng::Xoshiro, ::SamplerType{UInt64}) + s0, s1, s2, s3 = rng.s0, rng.s1, rng.s2, rng.s3 + tmp = s0 + s3 + res = tmp << 23 | tmp >> 41 + t = s1 << 17 + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = s3 << 45 | s3 >> 19 + rng.s0, rng.s1, rng.s2, rng.s3 = s0, s1, s2, s3 + res +end + + +## Task local RNG + +""" + TaskLocal + +The TaskLocal RNG has state that is local to its task, not its thread. +It is seeded upon task creation, from the state of its parent task. +Therefore, task creation is an event that changes the parent's RNG state. + +As an upside, the TaskLocal RNG is pretty fast, and permits reproducible +multithreaded simulations (barring race conditions), independent of scheduler +decisions. As long as the number of threads is not used to make decisions on +task creation, simulation results are also independent of the number of available +threads / CPUs. The random stream should not depend on hardware specifics, up to +endianness and possibly word size. + +Using or seeding the RNG of any other task than the one returned by `current_task()` +is undefined behavior: it will work most of the time, and may sometimes fail silently. +""" +struct TaskLocal <: AbstractRNG end +TaskLocal(::Nothing) = TaskLocal() +rng_native_52(::TaskLocal) = UInt64 + +function seed!(rng::TaskLocal, s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64) + # TODO: Consider a less ad-hoc construction + # We can afford burning a handful of cycles here, and we don't want any + # surprises with respect to bad seeds / bad interactions. 
+ t = current_task() + s = hash(s0) + t.rngState0 = s + s += hash(s1) + t.rngState1 = s + s += hash(s2) + t.rngState2 = s + s += hash(s3) + t.rngState3 = s + rand(rng, UInt64) + rand(rng, UInt64) + rand(rng, UInt64) + rand(rng, UInt64) + rng +end + +@inline function rand(::TaskLocal, ::SamplerType{UInt64}) + task = current_task() + s0, s1, s2, s3 = task.rngState0, task.rngState1, task.rngState2, task.rngState3 + tmp = s0 + s3 + res = tmp << 23 | tmp >> 41 + t = s1 << 17 + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = s3 << 45 | s3 >> 19 + task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3 + res +end + +# Shared implementation between Xoshiro and TaskLocal -- seeding +function seed!(rng::Union{TaskLocal, Xoshiro}, seed::UInt128) + seed0 = seed % UInt64 + seed1 = (seed>>>64) % UInt64 + seed!(rng, seed0, seed1, zero(UInt64), zero(UInt64)) +end +seed!(rng::Union{TaskLocal, Xoshiro}, seed::Integer) = seed!(rng, seed%UInt64, zero(UInt64), zero(UInt64), zero(UInt64)) +seed!(rng::Union{TaskLocal, Xoshiro}, ::Nothing) = seed!(rng) + +seed!(rng::Union{TaskLocal, Xoshiro}) = + seed!(rng, rand(RandomDevice(), UInt64), rand(RandomDevice(), UInt64), + rand(RandomDevice(), UInt64), rand(RandomDevice(), UInt64)) + +function seed!(rng::Union{TaskLocal, Xoshiro}, seed::AbstractVector{UInt64}) + if length(seed) > 4 + throw(ArgumentError("seed should have no more than 256 bits")) + end + seed0 = length(seed)>0 ? seed[1] : UInt64(0) + seed1 = length(seed)>1 ? seed[2] : UInt64(0) + seed2 = length(seed)>2 ? seed[3] : UInt64(0) + seed3 = length(seed)>3 ? seed[4] : UInt64(0) + seed!(rng, seed0, seed1, seed2, seed3) +end + +function seed!(rng::Union{TaskLocal, Xoshiro}, seed::AbstractVector{UInt32}) + if iseven(length(seed)) + seed!(rng, reinterpret(UInt64, seed)) + else + seed!(rng, UInt64[reinterpret(UInt64, @view(seed[begin:end-1])); seed[end] % UInt64]) + end +end + +@inline function rand(rng::Union{TaskLocal, Xoshiro}, ::SamplerType{UInt128}) + first = rand(rng, UInt64) + second = rand(rng,UInt64) + second + UInt128(first)<<64 +end + +@inline rand(rng::Union{TaskLocal, Xoshiro}, ::SamplerType{Int128}) = rand(rng, UInt128) % Int128 + +@inline rand(rng::Union{TaskLocal, Xoshiro}, ::SamplerType{T}) where {T<:Union{Bool, UInt8, Int8, UInt16, Int16, UInt32, Int32, Int64}} = rand(rng, UInt64) % T + +function copy(rng::TaskLocal) + t = current_task() + Xoshiro(t.rngState0, t.rngState1, t.rngState2, t.rngState3) +end + +function copy!(dst::TaskLocal, src::Xoshiro) + t = current_task() + t.rngState0, t.rngState1, t.rngState2, t.rngState3 = src.s0, src.s1, src.s2, src.s3 + dst +end + +function copy!(dst::Xoshiro, src::TaskLocal) + t = current_task() + dst.s0, dst.s1, dst.s2, dst.s3 = t.rngState0, t.rngState1, t.rngState2, t.rngState3 + dst +end + +function ==(a::Xoshiro, b::TaskLocal) + t = current_task() + a.s0 == t.rngState0 && a.s1 == t.rngState1 && a.s2 == t.rngState2 && a.s3 == t.rngState3 +end + +==(a::TaskLocal, b::Xoshiro) = b == a + +### low level API + #### floats mt_avail(r::MersenneTwister) = MT_CACHE_F - r.idxF @@ -382,6 +592,7 @@ end function __init__() resize!(empty!(THREAD_RNGs), Threads.nthreads()) # ensures that we didn't save a bad object + seed!(TaskLocal()) end diff --git a/stdlib/Random/src/Random.jl b/stdlib/Random/src/Random.jl index 2cdffd60672520..9ff79f6b4972ad 100644 --- a/stdlib/Random/src/Random.jl +++ b/stdlib/Random/src/Random.jl @@ -27,7 +27,7 @@ export rand!, randn!, shuffle, shuffle!, randperm, randperm!, 
randcycle, randcycle!,
-    AbstractRNG, MersenneTwister, RandomDevice
+    AbstractRNG, MersenneTwister, RandomDevice, TaskLocal, Xoshiro
 
 ## general definitions
 
@@ -296,6 +296,7 @@ include("RNGs.jl")
 include("generation.jl")
 include("normal.jl")
 include("misc.jl")
+include("XoshiroSimd.jl")
 
 ## rand & rand! & seed! docstrings
 
diff --git a/stdlib/Random/src/XoshiroSimd.jl b/stdlib/Random/src/XoshiroSimd.jl
new file mode 100644
index 00000000000000..5237767b5c945b
--- /dev/null
+++ b/stdlib/Random/src/XoshiroSimd.jl
@@ -0,0 +1,344 @@
+# This file is a part of Julia. License is MIT: https://julialang.org/license
+
+module XoshiroSimd
+# Getting the xoshiro RNG to reliably vectorize is somewhat of a hassle without Simd.jl.
+import ..Random: TaskLocal, rand, rand!, UnsafeView, Xoshiro, SamplerType, CloseOpen12, SamplerTrivial
+using Base: BitInteger_types
+
+# Vector-width. Influences random stream. We may want to tune this before merging.
+xoshiroWidth() = Val(8)
+# Simd threshold. Influences random stream. We may want to tune this before merging.
+simdThreshold(::Type{T}) where T = 2048
+simdThreshold(::Type{Bool}) = 4096
+
+@inline _rotl45(x::UInt64) = (x<<45)|(x>>19)
+@inline _shl17(x::UInt64) = x<<17
+@inline _rotl23(x::UInt64) = (x<<23)|(x>>41)
+@inline _plus(x::UInt64,y::UInt64) = x+y
+@inline _xor(x::UInt64,y::UInt64) = xor(x,y)
+@inline _and(x::UInt64, y::UInt64) = x & y
+@inline _or(x::UInt64, y::UInt64) = x | y
+@inline _lshr(x, y::Int32) = _lshr(x, y % Int64)
+@inline _lshr(x::UInt64, y::Int64) = Core.Intrinsics.llvmcall("""
+    %res = lshr i64 %0, %1
+    ret i64 %res
+    """,
+    UInt64,
+    Tuple{UInt64, Int64},
+    x, y)
+
+
+# required operations. These could be written more concisely with `ntuple`, but the compiler
+# sometimes refuses to properly vectorize.
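+# Illustration (an aside, not code that is compiled here): each N-lane method generated below
+# behaves like the lane-wise application of the corresponding scalar helper above, e.g. for
+# N = 4, `_rotl23(x)` is equivalent to
+#     ntuple(i -> VecElement(_rotl23(x[i].value)), Val(4))
+# but is emitted as a single `<4 x i64>` shl/lshr/or sequence so LLVM keeps it in vector registers.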
+for N in [1,2,4,8,16,32]
+    let
+        local code
+        code = """
+        %lshiftOp = shufflevector <1 x i64> <i64 45>, <1 x i64> undef, <$N x i32> zeroinitializer
+        %rshiftOp = shufflevector <1 x i64> <i64 19>, <1 x i64> undef, <$N x i32> zeroinitializer
+        %lshifted = shl <$N x i64> %0, %lshiftOp
+        %rshifted = lshr <$N x i64> %0, %rshiftOp
+        %res = or <$N x i64> %lshifted, %rshifted
+        ret <$N x i64> %res
+        """
+        @eval @inline _rotl45(x::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}},
+            x)
+
+        code = """
+        %lshiftOp = shufflevector <1 x i64> <i64 17>, <1 x i64> undef, <$N x i32> zeroinitializer
+        %res = shl <$N x i64> %0, %lshiftOp
+        ret <$N x i64> %res
+        """
+        @eval @inline _shl17(x::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}},
+            x)
+
+        code = """
+        %lshiftOp = shufflevector <1 x i64> <i64 23>, <1 x i64> undef, <$N x i32> zeroinitializer
+        %rshiftOp = shufflevector <1 x i64> <i64 41>, <1 x i64> undef, <$N x i32> zeroinitializer
+        %lshifted = shl <$N x i64> %0, %lshiftOp
+        %rshifted = lshr <$N x i64> %0, %rshiftOp
+        %res = or <$N x i64> %lshifted, %rshifted
+        ret <$N x i64> %res
+        """
+        @eval @inline _rotl23(x::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}},
+            x)
+
+        code = """
+        %res = add <$N x i64> %1, %0
+        ret <$N x i64> %res
+        """
+        @eval @inline _plus(x::NTuple{$N, VecElement{UInt64}}, y::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}, NTuple{$N, VecElement{UInt64}}},
+            x, y)
+
+        code = """
+        %res = xor <$N x i64> %1, %0
+        ret <$N x i64> %res
+        """
+        @eval @inline _xor(x::NTuple{$N, VecElement{UInt64}}, y::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}, NTuple{$N, VecElement{UInt64}}},
+            x, y)
+
+        code = """
+        %res = and <$N x i64> %1, %0
+        ret <$N x i64> %res
+        """
+        @eval @inline _and(x::NTuple{$N, VecElement{UInt64}}, y::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}, NTuple{$N, VecElement{UInt64}}},
+            x, y)
+
+        code = """
+        %res = or <$N x i64> %1, %0
+        ret <$N x i64> %res
+        """
+        @eval @inline _or(x::NTuple{$N, VecElement{UInt64}}, y::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}, NTuple{$N, VecElement{UInt64}}},
+            x, y)
+
+        code = """
+        %tmp = insertelement <1 x i64> undef, i64 %1, i32 0
+        %shift = shufflevector <1 x i64> %tmp, <1 x i64> %tmp, <$N x i32> zeroinitializer
+        %res = lshr <$N x i64> %0, %shift
+        ret <$N x i64> %res
+        """
+        @eval @inline _lshr(x::NTuple{$N, VecElement{UInt64}}, y::Int64) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}, Int64},
+            x, y)
+
+    end
+end
+
+
+# The mask values are used as res = _or(_and(res, mskBits), oneBits)
+
+mskBits(::Type{T}) where T = 0xffffffffffffffff
+mskBits(::Type{T}, ::Val{N}) where {T,N} = ntuple(i-> VecElement(mskBits(T)), Val(N))
+
+oneBits(::Type{T}) where T = 0x0000000000000000
+oneBits(::Type{T}, ::Val{N}) where {T,N} = ntuple(i-> VecElement(oneBits(T)), Val(N))
+
+mskBits(::Type{Float64}) = Base.significand_mask(Float64)
+oneBits(::Type{Float64}) = Base.exponent_one(Float64)
+
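+# For floats, `_or(_and(res, mskBits), oneBits)` pins the exponent to that of 1.0, so each
+# masked word is the bit pattern of a value in [1, 2); the float `rand!` methods below subtract
+# 1 to recover the usual [0, 1) range. The Float32 masks just below duplicate the 32-bit
+# patterns into both halves of the UInt64, packing two Float32 values per word.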
+mskBits(::Type{Float32}) = Base.significand_mask(Float32) + UInt64(Base.significand_mask(Float32))<<32
+oneBits(::Type{Float32}) = Base.exponent_one(Float32) + UInt64(Base.exponent_one(Float32))<<32
+
+
+function forkRand(rng::Union{TaskLocal, Xoshiro}, ::Val{N}) where N
+    # constants have nothing up their sleeve. For more discussion, cf rng_split in task.c
+    # 0x02011ce34bce797f == hash(UInt(1))|0x01
+    # 0x5a94851fb48a6e05 == hash(UInt(2))|0x01
+    # 0x3688cf5d48899fa7 == hash(UInt(3))|0x01
+    # 0x867b4bb4c42e5661 == hash(UInt(4))|0x01
+    s0 = ntuple(i->VecElement(0x02011ce34bce797f * rand(rng, UInt64)), Val(N))
+    s1 = ntuple(i->VecElement(0x5a94851fb48a6e05 * rand(rng, UInt64)), Val(N))
+    s2 = ntuple(i->VecElement(0x3688cf5d48899fa7 * rand(rng, UInt64)), Val(N))
+    s3 = ntuple(i->VecElement(0x867b4bb4c42e5661 * rand(rng, UInt64)), Val(N))
+    (s0, s1, s2, s3)
+end
+
+@inline function xoshiro_bulk(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, ::Val{N}) where {T<:Union{UInt8, Float32, Float64, Bool}, N}
+    if len >= simdThreshold(T)
+        written = xoshiro_bulk_simd(rng, dst, len, T, Val(N))
+        len -= written
+        dst += written
+    end
+    if len != 0
+        xoshiro_bulk_nosimd(rng, dst, len, T)
+    end
+    nothing
+end
+
+@noinline function xoshiro_bulk_nosimd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}) where {T}
+    # we rely on specialization, llvm const-prop + instcombine to remove unneeded masking
+    mskOR = oneBits(T)
+    mskAND = mskBits(T)
+    if rng isa TaskLocal
+        task = current_task()
+        s0, s1, s2, s3 = task.rngState0, task.rngState1, task.rngState2, task.rngState3
+    else
+        rng::Xoshiro
+        s0, s1, s2, s3 = rng.s0, rng.s1, rng.s2, rng.s3
+    end
+
+    i = 0
+    while i+8 <= len
+        res = _rotl23(_plus(s0,s3))
+        res = _or(_and(res, mskAND), mskOR)
+        unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), res)
+        t = _shl17(s1)
+        s2 = _xor(s2, s0)
+        s3 = _xor(s3, s1)
+        s1 = _xor(s1, s2)
+        s0 = _xor(s0, s3)
+        s2 = _xor(s2, t)
+        s3 = _rotl45(s3)
+        i += 8
+    end
+    if i < len
+        res = _rotl23(_plus(s0,s3))
+        res = _or(_and(res, mskAND), mskOR)
+        t = _shl17(s1)
+        s2 = _xor(s2, s0)
+        s3 = _xor(s3, s1)
+        s1 = _xor(s1, s2)
+        s0 = _xor(s0, s3)
+        s2 = _xor(s2, t)
+        s3 = _rotl45(s3)
+        ref = Ref(res)
+        # TODO: This may make the random-stream dependent on system endianness
+        GC.@preserve ref ccall(:memcpy, Nothing, (Ptr{UInt8}, Ptr{Cvoid}, Csize_t), dst+i, pointer_from_objref(ref) , len-i)
+    end
+    if rng isa TaskLocal
+        task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3
+    else
+        rng.s0, rng.s1, rng.s2, rng.s3 = s0, s1, s2, s3
+    end
+    nothing
+end
+
+@noinline function xoshiro_bulk_nosimd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool})
+    if rng isa TaskLocal
+        task = current_task()
+        s0, s1, s2, s3 = task.rngState0, task.rngState1, task.rngState2, task.rngState3
+    else
+        rng::Xoshiro
+        s0, s1, s2, s3 = rng.s0, rng.s1, rng.s2, rng.s3
+    end
+
+    i = 0
+    while i+8 <= len
+        res = _rotl23(_plus(s0,s3))
+        shift = 0
+        while i+8 <= len && shift < 8
+            resLoc = _and(_lshr(res, shift), 0x0101010101010101)
+            unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), resLoc)
+            i += 8
+            shift += 1
+        end
+
+        t = _shl17(s1)
+        s2 = _xor(s2, s0)
+        s3 = _xor(s3, s1)
+        s1 = _xor(s1, s2)
+        s0 = _xor(s0, s3)
+        s2 = _xor(s2, t)
+        s3 = _rotl45(s3)
+    end
+    if i < len
+        # we may overgenerate some bytes here, if len mod 64 <= 56 and len mod 8 != 0
+        res = _rotl23(_plus(s0,s3))
+        while i+8 <= len
+            resLoc = _and(res, 0x0101010101010101)
+            res = _lshr(res, 1)
+
unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), resLoc) + i += 8 + end + resLoc = _and(res, 0x0101010101010101) + ref = Ref(resLoc) + GC.@preserve ref ccall(:memcpy, Nothing, (Ptr{UInt8}, Ptr{Cvoid}, Csize_t), dst+i, pointer_from_objref(ref) , len-i) + t = _shl17(s1) + s2 = _xor(s2, s0) + s3 = _xor(s3, s1) + s1 = _xor(s1, s2) + s0 = _xor(s0, s3) + s2 = _xor(s2, t) + s3 = _rotl45(s3) + end + if rng isa TaskLocal + task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3 + else + rng.s0, rng.s1, rng.s2, rng.s3 = s0, s1, s2, s3 + end + nothing +end + + +@noinline function xoshiro_bulk_simd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, ::Val{N}) where {T,N} + # we rely on specialization, llvm const-prop + instcombine to remove unneeded masking + mskOR = oneBits(T, Val(N)) + mskAND = mskBits(T, Val(N)) + + s0, s1, s2, s3 = forkRand(rng, Val(N)) + + i = 0 + while i + 8*N <= len + res = _rotl23(_plus(s0,s3)) + res = _or(_and(res, mskAND), mskOR) + t = _shl17(s1) + s2 = _xor(s2, s0) + s3 = _xor(s3, s1) + s1 = _xor(s1, s2) + s0 = _xor(s0, s3) + s2 = _xor(s2, t) + s3 = _rotl45(s3) + unsafe_store!(reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + i), res) + i += 8*N + end + return i +end + +@noinline function xoshiro_bulk_simd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}, ::Val{N}) where {N} + s0, s1, s2, s3 = forkRand(rng, Val(N)) + msk = ntuple(i->VecElement(0x0101010101010101), Val(N)) + i = 0 + while i + 64*N <= len + res = _rotl23(_plus(s0,s3)) + t = _shl17(s1) + s2 = _xor(s2, s0) + s3 = _xor(s3, s1) + s1 = _xor(s1, s2) + s0 = _xor(s0, s3) + s2 = _xor(s2, t) + s3 = _rotl45(s3) + for k=0:7 + tmp = _lshr(res, k) + toWrite = _and(tmp, msk) + unsafe_store!(reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + i + k*N*8), toWrite) + end + i += 64*N + end + return i +end + + +function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Float32}, UnsafeView{Float32}}, ::SamplerTrivial{CloseOpen12{Float32}}) + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*4, Float32, xoshiroWidth()) + @inbounds @simd for i = 1:length(dst) + dst[i] = dst[i] - 1.0f0 + end + dst +end + +function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Float64}, UnsafeView{Float64}}, ::SamplerTrivial{CloseOpen12{Float64}}) + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*8, Float64, xoshiroWidth()) + @inbounds @simd for i = 1:length(dst) + dst[i] = dst[i] - 1.0e0 + end + dst +end + +for T in BitInteger_types + @eval function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{$T}, UnsafeView{$T}}, ::SamplerType{$T}) + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*sizeof($T), UInt8, xoshiroWidth()) + dst + end +end + +function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Bool}, UnsafeView{Bool}}, ::SamplerType{Bool}) + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst), Bool, xoshiroWidth()) + dst +end + +end # module diff --git a/test/threads_exec.jl b/test/threads_exec.jl index 860f0e03e2f5e8..14592ad2b73e47 100644 --- a/test/threads_exec.jl +++ b/test/threads_exec.jl @@ -874,3 +874,28 @@ end end @test sort!(collect(ys)) == 1:3 end + +# reproducible multi-threaded rand() + +using Random + +function reproducible_rand(r, i) + if i == 0 + return UInt64(0) + end + r1 = rand(r, UInt64)*hash(i) + t1 = Threads.@spawn reproducible_rand(r, i-1) + t2 = Threads.@spawn reproducible_rand(r, i-1) + r2 = rand(r, 
UInt64) + return r1 + r2 + fetch(t1) + fetch(t2) +end + +@testset "Task-local random" begin + r = Random.TaskLocal() + Random.seed!(r, 23) + val = reproducible_rand(r, 10) + for i = 1:4 + Random.seed!(r, 23) + @test reproducible_rand(r, 10) == val + end +end