From bce6397a15a4ce4a40269d498ef1afe14e660710 Mon Sep 17 00:00:00 2001 From: chethega Date: Sun, 23 Feb 2020 21:18:00 +0100 Subject: [PATCH] Introduce task-local and free-standing xoshiro RNG --- src/jltypes.c | 16 +- src/julia.h | 4 + src/task.c | 60 ++++++ stdlib/Random/src/RNGs.jl | 211 +++++++++++++++++++ stdlib/Random/src/Random.jl | 3 +- stdlib/Random/src/XoshiroSimd.jl | 344 +++++++++++++++++++++++++++++++ test/threads_exec.jl | 25 +++ 7 files changed, 658 insertions(+), 5 deletions(-) create mode 100644 stdlib/Random/src/XoshiroSimd.jl diff --git a/src/jltypes.c b/src/jltypes.c index 2ba69caf1991ed..00598504d5062e 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -2462,7 +2462,7 @@ void jl_init_types(void) JL_GC_DISABLED NULL, jl_any_type, jl_emptysvec, - jl_perm_symsvec(10, + jl_perm_symsvec(14, "next", "queue", "storage", @@ -2472,8 +2472,12 @@ void jl_init_types(void) JL_GC_DISABLED "code", "_state", "sticky", - "_isexception"), - jl_svec(10, + "_isexception", + "rngState0", + "rngState1", + "rngState2", + "rngState3"), + jl_svec(14, jl_any_type, jl_any_type, jl_any_type, @@ -2483,7 +2487,11 @@ void jl_init_types(void) JL_GC_DISABLED jl_any_type, jl_uint8_type, jl_bool_type, - jl_bool_type), + jl_bool_type, + jl_uint64_type, + jl_uint64_type, + jl_uint64_type, + jl_uint64_type), 0, 1, 6); jl_value_t *listt = jl_new_struct(jl_uniontype_type, jl_task_type, jl_nothing_type); jl_svecset(jl_task_type->types, 0, listt); diff --git a/src/julia.h b/src/julia.h index 1e0d96af669c73..a39949515a306c 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1809,6 +1809,10 @@ typedef struct _jl_task_t { uint8_t _state; uint8_t sticky; // record whether this Task can be migrated to a new thread uint8_t _isexception; // set if `result` is an exception to throw or that we exited with + uint64_t rngState0; // really rngState[4], but more convenient to split + uint64_t rngState1; + uint64_t rngState2; + uint64_t rngState3; // hidden state: // id of owning thread - does not need to be defined until the task runs diff --git a/src/task.c b/src/task.c index 4d38d30f5cc56e..da2662b7457efa 100644 --- a/src/task.c +++ b/src/task.c @@ -35,6 +35,7 @@ #include "julia_internal.h" #include "threading.h" #include "julia_assert.h" +#include "support/hashing.h" #ifdef __cplusplus extern "C" { @@ -666,6 +667,63 @@ JL_DLLEXPORT void jl_rethrow_other(jl_value_t *e JL_MAYBE_UNROOTED) throw_internal(NULL); } +/* This is xoshiro256++ 1.0, used for tasklocal random number generation in julia. + This implementation is intended for embedders and internal use by the runtime, and is + based on the reference implementation on http://prng.di.unimi.it + + Credits go to Sebastiano Vigna for coming up with this PRNG. + + There is a pure julia implementation in stdlib that tends to be faster when used from + within julia, due to inlining and more agressive architecture-specific optimizations. 
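+
+   A minimal calling sketch for embedders (illustrative only; it relies just on
+   `jl_get_ptls_states` and the task fields added in this patch):
+
+       jl_ptls_t ptls = jl_get_ptls_states();
+       uint64_t r = jl_tasklocal_genrandom(ptls->current_task);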
+*/
+JL_DLLEXPORT uint64_t jl_tasklocal_genrandom(jl_task_t *task) JL_NOTSAFEPOINT
+{
+    uint64_t s0 = task->rngState0;
+    uint64_t s1 = task->rngState1;
+    uint64_t s2 = task->rngState2;
+    uint64_t s3 = task->rngState3;
+
+    uint64_t t = s1 << 17;
+    uint64_t tmp = s0 + s3;
+    uint64_t res = ((tmp << 23) | (tmp >> 41)) + s0;
+    s2 ^= s0;
+    s3 ^= s1;
+    s1 ^= s2;
+    s0 ^= s3;
+    s2 ^= t;
+    s3 = (s3 << 45) | (s3 >> 19);
+
+    task->rngState0 = s0;
+    task->rngState1 = s1;
+    task->rngState2 = s2;
+    task->rngState3 = s3;
+    return res;
+}
+
+void rng_split(jl_task_t *from, jl_task_t *to) JL_NOTSAFEPOINT
+{
+    /* TODO: consider a less ad-hoc construction
+       Ideally we could just use the output of the random stream to seed the initial
+       state of the child. Out of an overabundance of caution we multiply with
+       effectively random coefficients, to break possible self-interactions.
+
+       It is not the goal to mix bits -- we work under the assumption that the
+       source is well-seeded, and its output looks effectively random.
+       However, xoshiro has never been studied in the mode where we seed the
+       initial state with the output of another xoshiro instance.
+
+       Constants have nothing up their sleeve:
+       0x02011ce34bce797f == hash(UInt(1))|0x01
+       0x5a94851fb48a6e05 == hash(UInt(2))|0x01
+       0x3688cf5d48899fa7 == hash(UInt(3))|0x01
+       0x867b4bb4c42e5661 == hash(UInt(4))|0x01
+    */
+    to->rngState0 = 0x02011ce34bce797f * jl_tasklocal_genrandom(from);
+    to->rngState1 = 0x5a94851fb48a6e05 * jl_tasklocal_genrandom(from);
+    to->rngState2 = 0x3688cf5d48899fa7 * jl_tasklocal_genrandom(from);
+    to->rngState3 = 0x867b4bb4c42e5661 * jl_tasklocal_genrandom(from);
+}
+
 JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion_future, size_t ssize)
 {
     jl_ptls_t ptls = jl_get_ptls_states();
@@ -701,6 +759,8 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion
     t->_isexception = 0;
     // Inherit logger state from parent task
     t->logstate = ptls->current_task->logstate;
+    // Fork task-local random state from parent
+    rng_split(ptls->current_task, t);
     // there is no active exception handler available on this stack yet
     t->eh = NULL;
     t->sticky = 1;
diff --git a/stdlib/Random/src/RNGs.jl b/stdlib/Random/src/RNGs.jl
index 5c29954f131321..6aede595e0415d 100644
--- a/stdlib/Random/src/RNGs.jl
+++ b/stdlib/Random/src/RNGs.jl
@@ -216,6 +216,216 @@ function reset_caches!(r::MersenneTwister)
     r
 end
+
+## Xoshiro RNG
+# Lots of implementation is shared with TaskLocal
+
+"""
+    Xoshiro
+
+Xoshiro256++ is a fast pseudorandom number generator originally developed by Sebastiano Vigna.
+The reference implementation is available at http://prng.di.unimi.it
+
+Apart from the high speed, Xoshiro has a small memory footprint, making it suitable for
+applications where many different random states need to be held for a long time.
+
+Julia's Xoshiro implementation has a bulk-generation mode; this seeds new virtual PRNGs
+from the parent, and uses SIMD to generate in parallel (i.e. the bulk stream consists of
+multiple interleaved xoshiro instances).
+The virtual PRNGs are discarded once the bulk request has been serviced (and should cause
+no heap allocations).
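+
+# Examples
+
+A minimal usage sketch (illustrative only; it relies on the `seed!` and `rand` methods added
+in this patch, and output values are omitted since they depend on the seed source):
+
+```julia
+using Random
+
+rng = Xoshiro()              # freshly seeded from RandomDevice()
+Random.seed!(rng, 1234)      # deterministic reseed
+x = rand(rng, UInt64)        # scalar draw
+v = rand(rng, UInt64, 10^4)  # large request; should be serviced by the SIMD bulk path
+```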
+""" +mutable struct Xoshiro <: AbstractRNG + s0::UInt64 + s1::UInt64 + s2::UInt64 + s3::UInt64 +end + +Xoshiro(::Nothing) = Xoshiro() + +function Xoshiro(parent::AbstractRNG = RandomDevice()) + # Constants have nothing up their sleeve, see task.c + # 0x02011ce34bce797f == hash(UInt(1))|0x01 + # 0x5a94851fb48a6e05 == hash(UInt(2))|0x01 + # 0x3688cf5d48899fa7 == hash(UInt(3))|0x01 + # 0x867b4bb4c42e5661 == hash(UInt(4))|0x01 + + Xoshiro(0x02011ce34bce797f * rand(parent, UInt64), + 0x5a94851fb48a6e05 * rand(parent, UInt64), + 0x3688cf5d48899fa7 * rand(parent, UInt64), + 0x867b4bb4c42e5661 * rand(parent, UInt64)) +end + +copy(rng::Xoshiro) = Xoshiro(rng.s0, rng.s1, rng.s2, rng.s3) + +function copy!(dst::Xoshiro, src::Xoshiro) + dst.s0, dst.s1, dst.s2, dst.s3 = src.s0, src.s1, src.s2, src.s3 + dst +end + +function ==(a::Xoshiro, b::Xoshiro) + a.s0 == b.s0 && a.s1 == b.s1 && a.s2 == b.s2 && a.s3 == b.s3 +end + +rng_native_52(::Xoshiro) = UInt64 + +function seed!(rng::Xoshiro, s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64) + # see task.c + s = Base.hash_uint64(s0) + rng.s0 = s + s += Base.hash_uint64(s1) + rng.s1 = s + s += Base.hash_uint64(s2) + rng.s2 = s + s += Base.hash_uint64(s3) + rng.s3 = s + rng +end + +@inline function rand(rng::Xoshiro, ::SamplerType{UInt64}) + s0, s1, s2, s3 = rng.s0, rng.s1, rng.s2, rng.s3 + tmp = s0 + s3 + res = tmp << 23 | tmp >> 41 + t = s1 << 17 + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = s3 << 45 | s3 >> 19 + rng.s0, rng.s1, rng.s2, rng.s3 = s0, s1, s2, s3 + res +end + + +## Task local RNG + +""" + TaskLocal + +The TaskLocal RNG has state that is local to its task, not its thread. +It is seeded upon task creation, from the state of its parent task. +Therefore, task creation is an event that changes the parent's RNG state. + +As an upside, the TaskLocal RNG is pretty fast, and permits reproducible +multithreaded simulations (barring race conditions), independent of scheduler +decisions. As long as the number of threads is not used to make decisions on +task creation, simulation results are also independent of the number of available +threads / CPUs. The random stream should not depend on hardware specifics, up to +endianness and possibly word size. + +Using or seeding the RNG of any other task than the one returned by `current_task()` +is undefined behavior: it will work most of the time, and may sometimes fail silently. +""" +struct TaskLocal <: AbstractRNG end +TaskLocal(::Nothing) = TaskLocal() +rng_native_52(::TaskLocal) = UInt64 + +function seed!(rng::TaskLocal, s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64) + # TODO: Consider a less ad-hoc construction + # We can afford burning a handful of cycles here, and we don't want any + # surprises with respect to bad seeds / bad interactions. 
+ t = current_task() + s = hash(s0) + t.rngState0 = s + s += hash(s1) + t.rngState1 = s + s += hash(s2) + t.rngState2 = s + s += hash(s3) + t.rngState3 = s + rand(rng, UInt64) + rand(rng, UInt64) + rand(rng, UInt64) + rand(rng, UInt64) + rng +end + +@inline function rand(::TaskLocal, ::SamplerType{UInt64}) + task = current_task() + s0, s1, s2, s3 = task.rngState0, task.rngState1, task.rngState2, task.rngState3 + tmp = s0 + s3 + res = tmp << 23 | tmp >> 41 + t = s1 << 17 + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = s3 << 45 | s3 >> 19 + task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3 + res +end + +# Shared implementation between Xoshiro and TaskLocal -- seeding +function seed!(rng::Union{TaskLocal, Xoshiro}, seed::UInt128) + seed0 = seed % UInt64 + seed1 = (seed>>>64) % UInt64 + seed!(rng, seed0, seed1, zero(UInt64), zero(UInt64)) +end +seed!(rng::Union{TaskLocal, Xoshiro}, seed::Integer) = seed!(rng, seed%UInt64, zero(UInt64), zero(UInt64), zero(UInt64)) +seed!(rng::Union{TaskLocal, Xoshiro}, ::Nothing) = seed!(rng) + +seed!(rng::Union{TaskLocal, Xoshiro}) = + seed!(rng, rand(RandomDevice(), UInt64), rand(RandomDevice(), UInt64), + rand(RandomDevice(), UInt64), rand(RandomDevice(), UInt64)) + +function seed!(rng::Union{TaskLocal, Xoshiro}, seed::AbstractVector{UInt64}) + if length(seed) > 4 + throw(ArgumentError("seed should have no more than 256 bits")) + end + seed0 = length(seed)>0 ? seed[1] : UInt64(0) + seed1 = length(seed)>1 ? seed[2] : UInt64(0) + seed2 = length(seed)>2 ? seed[3] : UInt64(0) + seed3 = length(seed)>3 ? seed[4] : UInt64(0) + seed!(rng, seed0, seed1, seed2, seed3) +end + +function seed!(rng::Union{TaskLocal, Xoshiro}, seed::AbstractVector{UInt32}) + if iseven(length(seed)) + seed!(rng, reinterpret(UInt64, seed)) + else + seed!(rng, UInt64[reinterpret(UInt64, @view(seed[begin:end-1])); seed[end] % UInt64]) + end +end + +@inline function rand(rng::Union{TaskLocal, Xoshiro}, ::SamplerType{UInt128}) + first = rand(rng, UInt64) + second = rand(rng,UInt64) + second + UInt128(first)<<64 +end + +@inline rand(rng::Union{TaskLocal, Xoshiro}, ::SamplerType{Int128}) = rand(rng, UInt128) % Int128 + +@inline rand(rng::Union{TaskLocal, Xoshiro}, ::SamplerType{T}) where {T<:Union{Bool, UInt8, Int8, UInt16, Int16, UInt32, Int32, Int64}} = rand(rng, UInt64) % T + +function copy(rng::TaskLocal) + t = current_task() + Xoshiro(t.rngState0, t.rngState1, t.rngState2, t.rngState3) +end + +function copy!(dst::TaskLocal, src::Xoshiro) + t = current_task() + t.rngState0, t.rngState1, t.rngState2, t.rngState3 = src.s0, src.s1, src.s2, src.s3 + dst +end + +function copy!(dst::Xoshiro, src::TaskLocal) + t = current_task() + dst.s0, dst.s1, dst.s2, dst.s3 = t.rngState0, t.rngState1, t.rngState2, t.rngState3 + dst +end + +function ==(a::Xoshiro, b::TaskLocal) + t = current_task() + a.s0 == t.rngState0 && a.s1 == t.rngState1 && a.s2 == t.rngState2 && a.s3 == t.rngState3 +end + +==(a::TaskLocal, b::Xoshiro) = b == a + +### low level API + #### floats mt_avail(r::MersenneTwister) = MT_CACHE_F - r.idxF @@ -382,6 +592,7 @@ end function __init__() resize!(empty!(THREAD_RNGs), Threads.nthreads()) # ensures that we didn't save a bad object + seed!(TaskLocal()) end diff --git a/stdlib/Random/src/Random.jl b/stdlib/Random/src/Random.jl index 2cdffd60672520..9ff79f6b4972ad 100644 --- a/stdlib/Random/src/Random.jl +++ b/stdlib/Random/src/Random.jl @@ -27,7 +27,7 @@ export rand!, randn!, shuffle, shuffle!, randperm, randperm!, 
randcycle, randcycle!,
-    AbstractRNG, MersenneTwister, RandomDevice
+    AbstractRNG, MersenneTwister, RandomDevice, TaskLocal, Xoshiro
 
 ## general definitions
 
@@ -296,6 +296,7 @@ include("RNGs.jl")
 include("generation.jl")
 include("normal.jl")
 include("misc.jl")
+include("XoshiroSimd.jl")
 
 ## rand & rand! & seed! docstrings
 
diff --git a/stdlib/Random/src/XoshiroSimd.jl b/stdlib/Random/src/XoshiroSimd.jl
new file mode 100644
index 00000000000000..5237767b5c945b
--- /dev/null
+++ b/stdlib/Random/src/XoshiroSimd.jl
@@ -0,0 +1,344 @@
+# This file is a part of Julia. License is MIT: https://julialang.org/license
+
+module XoshiroSimd
+# Getting the xoshiro RNG to reliably vectorize is somewhat of a hassle without Simd.jl.
+import ..Random: TaskLocal, rand, rand!, UnsafeView, Xoshiro, SamplerType, CloseOpen12, SamplerTrivial
+using Base: BitInteger_types
+
+# Vector-width. Influences random stream. We may want to tune this before merging.
+xoshiroWidth() = Val(8)
+# Simd threshold. Influences random stream. We may want to tune this before merging.
+simdThreshold(::Type{T}) where T = 2048
+simdThreshold(::Type{Bool}) = 4096
+
+@inline _rotl45(x::UInt64) = (x<<45)|(x>>19)
+@inline _shl17(x::UInt64) = x<<17
+@inline _rotl23(x::UInt64) = (x<<23)|(x>>41)
+@inline _plus(x::UInt64,y::UInt64) = x+y
+@inline _xor(x::UInt64,y::UInt64) = xor(x,y)
+@inline _and(x::UInt64, y::UInt64) = x & y
+@inline _or(x::UInt64, y::UInt64) = x | y
+@inline _lshr(x, y::Int32) = _lshr(x, y % Int64)
+@inline _lshr(x::UInt64, y::Int64) = Core.Intrinsics.llvmcall("""
+    %res = lshr i64 %0, %1
+    ret i64 %res
+    """,
+    UInt64,
+    Tuple{UInt64, Int64},
+    x, y)
+
+
+# required operations. These could be written more concisely with `ntuple`, but the compiler
+# sometimes refuses to properly vectorize.
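+# Illustration (an aside, not code that is compiled here): each N-lane method generated below
+# behaves like the lane-wise application of the corresponding scalar helper above, e.g. for
+# N = 4, `_rotl23(x)` is equivalent to
+#     ntuple(i -> VecElement(_rotl23(x[i].value)), Val(4))
+# but is emitted as a single `<4 x i64>` shl/lshr/or sequence so LLVM keeps it in vector registers.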
+for N in [1,2,4,8,16,32]
+    let
+        local code
+        code = """
+        %lshiftOp = shufflevector <1 x i64> <i64 45>, <1 x i64> undef, <$N x i32> zeroinitializer
+        %rshiftOp = shufflevector <1 x i64> <i64 19>, <1 x i64> undef, <$N x i32> zeroinitializer
+        %lshifted = shl <$N x i64> %0, %lshiftOp
+        %rshifted = lshr <$N x i64> %0, %rshiftOp
+        %res = or <$N x i64> %lshifted, %rshifted
+        ret <$N x i64> %res
+        """
+        @eval @inline _rotl45(x::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}},
+            x)
+
+        code = """
+        %lshiftOp = shufflevector <1 x i64> <i64 17>, <1 x i64> undef, <$N x i32> zeroinitializer
+        %res = shl <$N x i64> %0, %lshiftOp
+        ret <$N x i64> %res
+        """
+        @eval @inline _shl17(x::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}},
+            x)
+
+        code = """
+        %lshiftOp = shufflevector <1 x i64> <i64 23>, <1 x i64> undef, <$N x i32> zeroinitializer
+        %rshiftOp = shufflevector <1 x i64> <i64 41>, <1 x i64> undef, <$N x i32> zeroinitializer
+        %lshifted = shl <$N x i64> %0, %lshiftOp
+        %rshifted = lshr <$N x i64> %0, %rshiftOp
+        %res = or <$N x i64> %lshifted, %rshifted
+        ret <$N x i64> %res
+        """
+        @eval @inline _rotl23(x::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}},
+            x)
+
+        code = """
+        %res = add <$N x i64> %1, %0
+        ret <$N x i64> %res
+        """
+        @eval @inline _plus(x::NTuple{$N, VecElement{UInt64}}, y::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}, NTuple{$N, VecElement{UInt64}}},
+            x, y)
+
+        code = """
+        %res = xor <$N x i64> %1, %0
+        ret <$N x i64> %res
+        """
+        @eval @inline _xor(x::NTuple{$N, VecElement{UInt64}}, y::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}, NTuple{$N, VecElement{UInt64}}},
+            x, y)
+
+        code = """
+        %res = and <$N x i64> %1, %0
+        ret <$N x i64> %res
+        """
+        @eval @inline _and(x::NTuple{$N, VecElement{UInt64}}, y::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}, NTuple{$N, VecElement{UInt64}}},
+            x, y)
+
+        code = """
+        %res = or <$N x i64> %1, %0
+        ret <$N x i64> %res
+        """
+        @eval @inline _or(x::NTuple{$N, VecElement{UInt64}}, y::NTuple{$N, VecElement{UInt64}}) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}, NTuple{$N, VecElement{UInt64}}},
+            x, y)
+
+        code = """
+        %tmp = insertelement <1 x i64> undef, i64 %1, i32 0
+        %shift = shufflevector <1 x i64> %tmp, <1 x i64> %tmp, <$N x i32> zeroinitializer
+        %res = lshr <$N x i64> %0, %shift
+        ret <$N x i64> %res
+        """
+        @eval @inline _lshr(x::NTuple{$N, VecElement{UInt64}}, y::Int64) = Core.Intrinsics.llvmcall($code,
+            NTuple{$N, VecElement{UInt64}},
+            Tuple{NTuple{$N, VecElement{UInt64}}, Int64},
+            x, y)
+
+    end
+end
+
+
+# The mask values are used as res = _or(_and(res, mskBits), oneBits)
+
+mskBits(::Type{T}) where T = 0xffffffffffffffff
+mskBits(::Type{T}, ::Val{N}) where {T,N} = ntuple(i-> VecElement(mskBits(T)), Val(N))
+
+oneBits(::Type{T}) where T = 0x0000000000000000
+oneBits(::Type{T}, ::Val{N}) where {T,N} = ntuple(i-> VecElement(oneBits(T)), Val(N))
+
+mskBits(::Type{Float64}) = Base.significand_mask(Float64)
+oneBits(::Type{Float64}) = Base.exponent_one(Float64)
+
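+# For floats, `_or(_and(res, mskBits), oneBits)` pins the exponent to that of 1.0, so each
+# masked word is the bit pattern of a value in [1, 2); the float `rand!` methods below subtract
+# 1 to recover the usual [0, 1) range. The Float32 masks just below duplicate the 32-bit
+# patterns into both halves of the UInt64, packing two Float32 values per word.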
+mskBits(::Type{Float32}) = Base.significand_mask(Float32) + UInt64(Base.significand_mask(Float32))<<32
+oneBits(::Type{Float32}) = Base.exponent_one(Float32) + UInt64(Base.exponent_one(Float32))<<32
+
+
+function forkRand(rng::Union{TaskLocal, Xoshiro}, ::Val{N}) where N
+    # constants have nothing up their sleeve. For more discussion, cf rng_split in task.c
+    # 0x02011ce34bce797f == hash(UInt(1))|0x01
+    # 0x5a94851fb48a6e05 == hash(UInt(2))|0x01
+    # 0x3688cf5d48899fa7 == hash(UInt(3))|0x01
+    # 0x867b4bb4c42e5661 == hash(UInt(4))|0x01
+    s0 = ntuple(i->VecElement(0x02011ce34bce797f * rand(rng, UInt64)), Val(N))
+    s1 = ntuple(i->VecElement(0x5a94851fb48a6e05 * rand(rng, UInt64)), Val(N))
+    s2 = ntuple(i->VecElement(0x3688cf5d48899fa7 * rand(rng, UInt64)), Val(N))
+    s3 = ntuple(i->VecElement(0x867b4bb4c42e5661 * rand(rng, UInt64)), Val(N))
+    (s0, s1, s2, s3)
+end
+
+@inline function xoshiro_bulk(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, ::Val{N}) where {T<:Union{UInt8, Float32, Float64, Bool}, N}
+    if len >= simdThreshold(T)
+        written = xoshiro_bulk_simd(rng, dst, len, T, Val(N))
+        len -= written
+        dst += written
+    end
+    if len != 0
+        xoshiro_bulk_nosimd(rng, dst, len, T)
+    end
+    nothing
+end
+
+@noinline function xoshiro_bulk_nosimd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}) where {T}
+    # we rely on specialization, llvm const-prop + instcombine to remove unneeded masking
+    mskOR = oneBits(T)
+    mskAND = mskBits(T)
+    if rng isa TaskLocal
+        task = current_task()
+        s0, s1, s2, s3 = task.rngState0, task.rngState1, task.rngState2, task.rngState3
+    else
+        rng::Xoshiro
+        s0, s1, s2, s3 = rng.s0, rng.s1, rng.s2, rng.s3
+    end
+
+    i = 0
+    while i+8 <= len
+        res = _rotl23(_plus(s0,s3))
+        res = _or(_and(res, mskAND), mskOR)
+        unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), res)
+        t = _shl17(s1)
+        s2 = _xor(s2, s0)
+        s3 = _xor(s3, s1)
+        s1 = _xor(s1, s2)
+        s0 = _xor(s0, s3)
+        s2 = _xor(s2, t)
+        s3 = _rotl45(s3)
+        i += 8
+    end
+    if i < len
+        res = _rotl23(_plus(s0,s3))
+        res = _or(_and(res, mskAND), mskOR)
+        t = _shl17(s1)
+        s2 = _xor(s2, s0)
+        s3 = _xor(s3, s1)
+        s1 = _xor(s1, s2)
+        s0 = _xor(s0, s3)
+        s2 = _xor(s2, t)
+        s3 = _rotl45(s3)
+        ref = Ref(res)
+        # TODO: This may make the random-stream dependent on system endianness
+        GC.@preserve ref ccall(:memcpy, Nothing, (Ptr{UInt8}, Ptr{Cvoid}, Csize_t), dst+i, pointer_from_objref(ref) , len-i)
+    end
+    if rng isa TaskLocal
+        task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3
+    else
+        rng.s0, rng.s1, rng.s2, rng.s3 = s0, s1, s2, s3
+    end
+    nothing
+end
+
+@noinline function xoshiro_bulk_nosimd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool})
+    if rng isa TaskLocal
+        task = current_task()
+        s0, s1, s2, s3 = task.rngState0, task.rngState1, task.rngState2, task.rngState3
+    else
+        rng::Xoshiro
+        s0, s1, s2, s3 = rng.s0, rng.s1, rng.s2, rng.s3
+    end
+
+    i = 0
+    while i+8 <= len
+        res = _rotl23(_plus(s0,s3))
+        shift = 0
+        while i+8 <= len && shift < 8
+            resLoc = _and(_lshr(res, shift), 0x0101010101010101)
+            unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), resLoc)
+            i += 8
+            shift += 1
+        end
+
+        t = _shl17(s1)
+        s2 = _xor(s2, s0)
+        s3 = _xor(s3, s1)
+        s1 = _xor(s1, s2)
+        s0 = _xor(s0, s3)
+        s2 = _xor(s2, t)
+        s3 = _rotl45(s3)
+    end
+    if i < len
+        # we may overgenerate some bytes here, if len mod 64 <= 56 and len mod 8 != 0
+        res = _rotl23(_plus(s0,s3))
+        while i+8 <= len
+            resLoc = _and(res, 0x0101010101010101)
+            res = _lshr(res, 1)
+
unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), resLoc) + i += 8 + end + resLoc = _and(res, 0x0101010101010101) + ref = Ref(resLoc) + GC.@preserve ref ccall(:memcpy, Nothing, (Ptr{UInt8}, Ptr{Cvoid}, Csize_t), dst+i, pointer_from_objref(ref) , len-i) + t = _shl17(s1) + s2 = _xor(s2, s0) + s3 = _xor(s3, s1) + s1 = _xor(s1, s2) + s0 = _xor(s0, s3) + s2 = _xor(s2, t) + s3 = _rotl45(s3) + end + if rng isa TaskLocal + task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3 + else + rng.s0, rng.s1, rng.s2, rng.s3 = s0, s1, s2, s3 + end + nothing +end + + +@noinline function xoshiro_bulk_simd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, ::Val{N}) where {T,N} + # we rely on specialization, llvm const-prop + instcombine to remove unneeded masking + mskOR = oneBits(T, Val(N)) + mskAND = mskBits(T, Val(N)) + + s0, s1, s2, s3 = forkRand(rng, Val(N)) + + i = 0 + while i + 8*N <= len + res = _rotl23(_plus(s0,s3)) + res = _or(_and(res, mskAND), mskOR) + t = _shl17(s1) + s2 = _xor(s2, s0) + s3 = _xor(s3, s1) + s1 = _xor(s1, s2) + s0 = _xor(s0, s3) + s2 = _xor(s2, t) + s3 = _rotl45(s3) + unsafe_store!(reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + i), res) + i += 8*N + end + return i +end + +@noinline function xoshiro_bulk_simd(rng::Union{TaskLocal, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}, ::Val{N}) where {N} + s0, s1, s2, s3 = forkRand(rng, Val(N)) + msk = ntuple(i->VecElement(0x0101010101010101), Val(N)) + i = 0 + while i + 64*N <= len + res = _rotl23(_plus(s0,s3)) + t = _shl17(s1) + s2 = _xor(s2, s0) + s3 = _xor(s3, s1) + s1 = _xor(s1, s2) + s0 = _xor(s0, s3) + s2 = _xor(s2, t) + s3 = _rotl45(s3) + for k=0:7 + tmp = _lshr(res, k) + toWrite = _and(tmp, msk) + unsafe_store!(reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + i + k*N*8), toWrite) + end + i += 64*N + end + return i +end + + +function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Float32}, UnsafeView{Float32}}, ::SamplerTrivial{CloseOpen12{Float32}}) + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*4, Float32, xoshiroWidth()) + @inbounds @simd for i = 1:length(dst) + dst[i] = dst[i] - 1.0f0 + end + dst +end + +function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Float64}, UnsafeView{Float64}}, ::SamplerTrivial{CloseOpen12{Float64}}) + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*8, Float64, xoshiroWidth()) + @inbounds @simd for i = 1:length(dst) + dst[i] = dst[i] - 1.0e0 + end + dst +end + +for T in BitInteger_types + @eval function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{$T}, UnsafeView{$T}}, ::SamplerType{$T}) + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*sizeof($T), UInt8, xoshiroWidth()) + dst + end +end + +function rand!(rng::Union{TaskLocal, Xoshiro}, dst::Union{Array{Bool}, UnsafeView{Bool}}, ::SamplerType{Bool}) + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst), Bool, xoshiroWidth()) + dst +end + +end # module diff --git a/test/threads_exec.jl b/test/threads_exec.jl index 860f0e03e2f5e8..14592ad2b73e47 100644 --- a/test/threads_exec.jl +++ b/test/threads_exec.jl @@ -874,3 +874,28 @@ end end @test sort!(collect(ys)) == 1:3 end + +# reproducible multi-threaded rand() + +using Random + +function reproducible_rand(r, i) + if i == 0 + return UInt64(0) + end + r1 = rand(r, UInt64)*hash(i) + t1 = Threads.@spawn reproducible_rand(r, i-1) + t2 = Threads.@spawn reproducible_rand(r, i-1) + r2 = rand(r, 
UInt64) + return r1 + r2 + fetch(t1) + fetch(t2) +end + +@testset "Task-local random" begin + r = Random.TaskLocal() + Random.seed!(r, 23) + val = reproducible_rand(r, 10) + for i = 1:4 + Random.seed!(r, 23) + @test reproducible_rand(r, 10) == val + end +end