From 233bb0c81239a86705d6ea306f69617250b414f5 Mon Sep 17 00:00:00 2001
From: Ian Atol <ianatol@arctic1.juliacomputing.io>
Date: Wed, 29 Sep 2021 18:33:29 -0400
Subject: [PATCH] Implement maybecopy in BoundsError, start optimization,
 refactor memory_opt!

---
 base/abstractarray.jl           |   4 +
 base/array.jl                   |  53 +++-
 base/boot.jl                    |  13 +-
 base/broadcast.jl               |  13 +
 base/compiler/optimize.jl       |   1 +
 base/compiler/ssair/passes.jl   | 162 ++++++++++--
 base/pointer.jl                 |   1 +
 src/builtin_proto.h             |   1 +
 src/builtins.c                  |  14 +
 src/codegen.cpp                 |   4 +-
 src/staticdata.c                |   5 +-
 test/compiler/immutablearray.jl | 456 +++++++++++++++++++++++++++++++-
 12 files changed, 685 insertions(+), 42 deletions(-)

diff --git a/base/abstractarray.jl b/base/abstractarray.jl
index d5d47fe855bd5..20d49e0bfbcf3 100644
--- a/base/abstractarray.jl
+++ b/base/abstractarray.jl
@@ -1073,6 +1073,10 @@ function copy(a::AbstractArray)
     copymutable(a)
 end
 
+function copy(a::Core.ImmutableArray)
+    a
+end
+
 function copyto!(B::AbstractVecOrMat{R}, ir_dest::AbstractRange{Int}, jr_dest::AbstractRange{Int},
                A::AbstractVecOrMat{S}, ir_src::AbstractRange{Int}, jr_src::AbstractRange{Int}) where {R,S}
     if length(ir_dest) != length(ir_src)
diff --git a/base/array.jl b/base/array.jl
index f38e2e10c08e2..dc22cc4a21cb0 100644
--- a/base/array.jl
+++ b/base/array.jl
@@ -118,6 +118,35 @@ Union type of [`DenseVector{T}`](@ref) and [`DenseMatrix{T}`](@ref).
 """
 const DenseVecOrMat{T} = Union{DenseVector{T}, DenseMatrix{T}}
 
+"""
+    ImmutableArray
+
+Dynamically allocated, immutable array.
+
+"""
+const ImmutableArray = Core.ImmutableArray
+
+"""
+    IMArray{T,N}
+
+Union type of [`Array{T,N}`](@ref) and [`ImmutableArray{T,N}`](@ref)
+"""
+const IMArray{T,N} = Union{Array{T, N}, ImmutableArray{T,N}}
+
+"""
+    IMVector{T}
+
+One-dimensional [`ImmutableArray`](@ref) or [`Array`](@ref) with elements of type `T`. Alias for `IMArray{T, 1}`.
+"""
+const IMVector{T} = IMArray{T, 1}
+
+"""
+    IMMatrix{T}
+
+Two-dimensional [`ImmutableArray`](@ref) or [`Array`](@ref) with elements of type `T`. Alias for `IMArray{T,2}`.
+"""
+const IMMatrix{T} = IMArray{T, 2}
+
 ## Basic functions ##
 
 import Core: arraysize, arrayset, arrayref, const_arrayref
@@ -147,14 +176,13 @@ function vect(X...)
     return copyto!(Vector{T}(undef, length(X)), X)
 end
 
-const ImmutableArray = Core.ImmutableArray
-const IMArray{T,N} = Union{Array{T, N}, ImmutableArray{T,N}}
-const IMVector{T} = IMArray{T, 1}
-const IMMatrix{T} = IMArray{T, 2}
-
+# Freeze and thaw constructors
 ImmutableArray(a::Array) = Core.arrayfreeze(a)
 Array(a::ImmutableArray) = Core.arraythaw(a)
 
+ImmutableArray(a::AbstractArray{T,N}) where {T,N} = ImmutableArray{T,N}(a)
+
+# Size functions for arrays, both mutable and immutable
 size(a::IMArray, d::Integer) = arraysize(a, convert(Int, d))
 size(a::IMVector) = (arraysize(a,1),)
 size(a::IMMatrix) = (arraysize(a,1), arraysize(a,2))
@@ -393,6 +421,18 @@ similar(a::Array{T}, m::Int) where {T}              = Vector{T}(undef, m)
 similar(a::Array, T::Type, dims::Dims{N}) where {N} = Array{T,N}(undef, dims)
 similar(a::Array{T}, dims::Dims{N}) where {T,N}     = Array{T,N}(undef, dims)
 
+ImmutableArray{T}(undef::UndefInitializer, m::Int) where T = ImmutableArray(Array{T}(undef, m))
+ImmutableArray{T}(undef::UndefInitializer, dims::Dims) where T = ImmutableArray(Array{T}(undef, dims))
+
+"""
+    maybecopy(x)
+
+`maybecopy` provides access to `x` while ensuring it does not escape.
+To do so, the optimizer decides whether to create a copy of `x` or not based on the implementation
+That is, `maybecopy` will either be a call to [`copy`](@ref) or just a reference to x.
+"""
+maybecopy = Core.maybecopy
+
 # T[x...] constructs Array{T,1}
 """
     getindex(type[, elements...])
@@ -626,8 +666,8 @@ oneunit(x::AbstractMatrix{T}) where {T} = _one(oneunit(T), x)
 
 ## Conversions ##
 
-convert(::Type{T}, a::AbstractArray) where {T<:Array} = a isa T ? a : T(a)
 convert(::Type{Union{}}, a::AbstractArray) = throw(MethodError(convert, (Union{}, a)))
+convert(T::Union{Type{<:Array},Type{<:Core.ImmutableArray}}, a::AbstractArray) = a isa T ? a : T(a)
 
 promote_rule(a::Type{Array{T,n}}, b::Type{Array{S,n}}) where {T,n,S} = el_same(promote_type(T,S), a, b)
 
@@ -637,6 +677,7 @@ if nameof(@__MODULE__) === :Base  # avoid method overwrite
 # constructors should make copies
 Array{T,N}(x::AbstractArray{S,N})         where {T,N,S} = copyto_axcheck!(Array{T,N}(undef, size(x)), x)
 AbstractArray{T,N}(A::AbstractArray{S,N}) where {T,N,S} = copyto_axcheck!(similar(A,T), A)
+ImmutableArray{T,N}(Ar::AbstractArray{S,N}) where {T,N,S} = Core.arrayfreeze(copyto_axcheck!(Array{T,N}(undef, size(Ar)), Ar))
 end
 
 ## copying iterators to containers
diff --git a/base/boot.jl b/base/boot.jl
index 98b8cf2e9cf40..d59af90d54f13 100644
--- a/base/boot.jl
+++ b/base/boot.jl
@@ -279,8 +279,17 @@ struct BoundsError <: Exception
     a::Any
     i::Any
     BoundsError() = new()
-    BoundsError(@nospecialize(a)) = (@_noinline_meta; new(a))
-    BoundsError(@nospecialize(a), i) = (@_noinline_meta; new(a,i))
+    # For now, always copy arrays to avoid escaping them
+    # Eventually, we want to figure out if the copy is needed to save the performance of copying
+    # (i.e., if a escapes elsewhere, don't bother to make a copy)
+
+    BoundsError(@nospecialize(a)) = a isa Array ?
+                                    (@_noinline_meta; new(Core.maybecopy(a))) :
+                                    (@_noinline_meta; new(a))
+
+    BoundsError(@nospecialize(a), i) = a isa Array ?
+                                       (@_noinline_meta; new(Core.maybecopy(a), i)) :
+                                       (@_noinline_meta; new(a, i))
 end
 struct DivideError         <: Exception end
 struct OutOfMemoryError    <: Exception end
diff --git a/base/broadcast.jl b/base/broadcast.jl
index b34a73041708b..83067ed46f5e4 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -1385,4 +1385,17 @@ function Base.show(io::IO, op::BroadcastFunction)
 end
 Base.show(io::IO, ::MIME"text/plain", op::BroadcastFunction) = show(io, op)
 
+struct IMArrayStyle <: Broadcast.AbstractArrayStyle{Any} end
+BroadcastStyle(::Type{<:Core.ImmutableArray}) = IMArrayStyle()
+
+#similar has to return mutable array
+function Base.similar(bc::Broadcasted{IMArrayStyle}, ::Type{ElType}) where ElType
+    similar(Array{ElType}, axes(bc))
+end
+
+@inline function copy(bc::Broadcasted{IMArrayStyle})
+    ElType = combine_eltypes(bc.f, bc.args)
+    return Core.ImmutableArray(copyto!(similar(bc, ElType), bc))
+end
+
 end # module
diff --git a/base/compiler/optimize.jl b/base/compiler/optimize.jl
index 62f4e32b90aea..de6294393d3c1 100644
--- a/base/compiler/optimize.jl
+++ b/base/compiler/optimize.jl
@@ -307,6 +307,7 @@ function run_passes(ci::CodeInfo, sv::OptimizationState)
     ir = adce_pass!(ir)
     #@Base.show ("after_adce", ir)
     @timeit "type lift" ir = type_lift_pass!(ir)
+    #@timeit "compact 3" ir = compact!(ir)
     ir = memory_opt!(ir)
     #@Base.show ir
     if JLOptions().debug_level == 2
diff --git a/base/compiler/ssair/passes.jl b/base/compiler/ssair/passes.jl
index 54d4d46b2c5ab..723776f49fe37 100644
--- a/base/compiler/ssair/passes.jl
+++ b/base/compiler/ssair/passes.jl
@@ -1256,76 +1256,184 @@ function cfg_simplify!(ir::IRCode)
     return finish(compact)
 end
 
-function is_allocation(stmt)
+# function is_known_fcall(stmt::Expr, @nospecialize(func))
+#     isexpr(stmt, :foreigncall) || return false
+#     s = stmt.args[1]
+#     isa(s, QuoteNode) && (s = s.value)
+#     return s === func
+# end
+
+function is_known_fcall(stmt::Expr, funcs::Vector{Symbol})
     isexpr(stmt, :foreigncall) || return false
     s = stmt.args[1]
     isa(s, QuoteNode) && (s = s.value)
-    return s === :jl_alloc_array_1d
+    # return any(e -> s === e, funcs)
+    return true in map(e -> s === e, funcs)
+end
+
+function is_allocation(stmt::Expr)
+    isexpr(stmt, :foreigncall) || return false
+    s = stmt.args[1]
+    isa(s, QuoteNode) && (s = s.value)
+    return (s === :jl_alloc_array_1d
+         || s === :jl_alloc_array_2d
+         || s ===  :jl_alloc_array_3d
+         || s === :jl_new_array)
 end
 
 function memory_opt!(ir::IRCode)
     compact = IncrementalCompact(ir, false)
     uses = IdDict{Int, Vector{Int}}()
-    relevant = IdSet{Int}()
-    revisit = Int[]
-    function mark_val(val)
+    relevant = IdSet{Int}() # allocations
+    revisit = Int[] # potential targets for a mutating_arrayfreeze drop-in
+    maybecopies = Int[] # calls to maybecopy
+
+    function mark_escape(@nospecialize val)
         isa(val, SSAValue) || return
+        #println(val.id, " escaped.")
         val.id in relevant && pop!(relevant, val.id)
     end
+
+    function mark_use(val, idx)
+        isa(val, SSAValue) || return
+        id = val.id
+        id in relevant || return
+        (haskey(uses, id)) || (uses[id] = Int[])
+        push!(uses[id], idx)
+    end
+
     for ((_, idx), stmt) in compact
+
+        #println("idx: ", idx, " = ", stmt)
+
         if isa(stmt, ReturnNode)
             isdefined(stmt, :val) || continue
             val = stmt.val
-            if isa(val, SSAValue) && val.id in relevant
-                (haskey(uses, val.id)) || (uses[val.id] = Int[])
-                push!(uses[val.id], idx)
-            end
+            mark_use(val, idx)
             continue
+
+        # check for phinodes that are possibly allocations
+        elseif isa(stmt, PhiNode)
+
+            # ensure all of the phinode values are defined
+            defined = true
+            for i = 1:length(stmt.values)
+                if !isassigned(stmt.values, i)
+                    defined = false
+                end
+            end
+
+            defined || continue
+
+            for val in stmt.values
+                if isa(val, SSAValue) && val.id in relevant
+                    push!(relevant, idx)
+                end
+            end
         end
+
         (isexpr(stmt, :call) || isexpr(stmt, :foreigncall)) || continue
+
+        if is_known_call(stmt, Core.maybecopy, compact)
+            push!(maybecopies, idx)
+            continue
+        end
+
         if is_allocation(stmt)
             push!(relevant, idx)
             # TODO: Mark everything else here
             continue
         end
-        # TODO: Replace this by interprocedural escape analysis
-        if is_known_call(stmt, arrayset, compact)
+
+        if is_known_call(stmt, arrayset, compact) && length(stmt.args) >= 5
             # The value being set escapes, everything else doesn't
-            mark_val(stmt.args[4])
+            mark_escape(stmt.args[4])
             arr = stmt.args[3]
-            if isa(arr, SSAValue) && arr.id in relevant
-                (haskey(uses, arr.id)) || (uses[arr.id] = Int[])
-                push!(uses[arr.id], idx)
-            end
+            mark_use(arr, idx)
+
+        elseif is_known_call(stmt, arrayref, compact) && length(stmt.args) == 4
+            arr = stmt.args[3]
+            mark_use(arr, idx)
+
+        elseif is_known_call(stmt, setindex!, compact) && length(stmt.args) == 4
+            # handle similarly to arrayset
+            val = stmt.args[3]
+            mark_escape(val)
+
+            arr = stmt.args[2]
+            mark_use(arr, idx)
+
+        elseif is_known_call(stmt, (===), compact) && length(stmt.args) == 3
+            arr1 = stmt.args[2]
+            arr2 = stmt.args[3]
+
+            mark_use(arr1, idx)
+            mark_use(arr2, idx)
+
+        # these foreigncalls have similar structure and don't escape our array, so handle them all at once
+        elseif is_known_fcall(stmt, [:jl_array_ptr, :jl_array_copy]) && length(stmt.args) == 6
+            arr = stmt.args[6]
+            mark_use(arr, idx)
+
+        elseif is_known_call(stmt, arraysize, compact) && isa(stmt.args[2], SSAValue)
+            arr = stmt.args[2]
+            mark_use(arr, idx)
+
         elseif is_known_call(stmt, Core.arrayfreeze, compact) && isa(stmt.args[2], SSAValue)
+            # mark these for potential replacement with mutating_arrayfreeze
             push!(revisit, idx)
+
         else
-            # For now we assume everything escapes
-            # TODO: We could handle PhiNodes specially and improve this
+            # Assume everything else escapes
             for ur in userefs(stmt)
-                mark_val(ur[])
+                mark_escape(ur[])
             end
         end
     end
+
     ir = finish(compact)
-    isempty(revisit) && return ir
+    isempty(revisit) && isempty(maybecopies) && return ir
+
     domtree = construct_domtree(ir.cfg.blocks)
+
     for idx in revisit
         # Make sure that the value we reference didn't escape
-        id = ir.stmts[idx][:inst].args[2].id
+        stmt = ir.stmts[idx][:inst]::Expr
+        id = (stmt.args[2]::SSAValue).id
         (id in relevant) || continue
 
+        #println("Revisiting ", stmt)
+
         # We're ok to steal the memory if we don't dominate any uses
         ok = true
-        for use in uses[id]
-            if ssadominates(ir, domtree, idx, use)
-                ok = false
-                break
+        if haskey(uses, id)
+            for use in uses[id]
+                if ssadominates(ir, domtree, idx, use)
+                    ok = false
+                    break
+                end
             end
         end
         ok || continue
-
-        ir.stmts[idx][:inst].args[1] = Core.mutating_arrayfreeze
+        stmt.args[1] = Core.mutating_arrayfreeze
     end
+
+    # TODO: Use escape analysis info to determine if maybecopy should copy
+
+    # for idx in maybecopies
+    #     stmt = ir.stmts[idx][:inst]::Expr
+    #     #println(stmt.args)
+    #     arr = stmt.args[2]
+    #     id = isa(arr, SSAValue) ? arr.id : arr.n # SSAValue or Core.Argument
+
+    #     if (id in relevant) # didn't escape elsewhere, so make a copy to keep it un-escaped
+    #         #println("didn't escape maybecopy")
+    #         stmt.args[1] = Main.Base.copy
+    #     else # already escaped, so save the cost of copying and just pass the actual object
+    #         #println("escaped maybecopy")
+    #         ir.stmts[idx][:inst] = arr
+    #     end
+    # end
+
     return ir
 end
diff --git a/base/pointer.jl b/base/pointer.jl
index b315e589ffd9a..2c526a7063b2e 100644
--- a/base/pointer.jl
+++ b/base/pointer.jl
@@ -63,6 +63,7 @@ cconvert(::Type{Ptr{UInt8}}, s::AbstractString) = String(s)
 cconvert(::Type{Ptr{Int8}}, s::AbstractString) = String(s)
 
 unsafe_convert(::Type{Ptr{T}}, a::Array{T}) where {T} = ccall(:jl_array_ptr, Ptr{T}, (Any,), a)
+unsafe_convert(::Type{Ptr{T}}, a::Core.ImmutableArray{T}) where {T} = ccall(:jl_array_ptr, Ptr{T}, (Any,), a)
 unsafe_convert(::Type{Ptr{S}}, a::AbstractArray{T}) where {S,T} = convert(Ptr{S}, unsafe_convert(Ptr{T}, a))
 unsafe_convert(::Type{Ptr{T}}, a::AbstractArray{T}) where {T} = error("conversion to pointer not defined for $(typeof(a))")
 
diff --git a/src/builtin_proto.h b/src/builtin_proto.h
index f781658ed55f9..b04fc1c671b14 100644
--- a/src/builtin_proto.h
+++ b/src/builtin_proto.h
@@ -54,6 +54,7 @@ DECLARE_BUILTIN(_typevar);
 DECLARE_BUILTIN(arrayfreeze);
 DECLARE_BUILTIN(arraythaw);
 DECLARE_BUILTIN(mutating_arrayfreeze);
+DECLARE_BUILTIN(maybecopy);
 
 JL_CALLABLE(jl_f_invoke_kwsorter);
 JL_CALLABLE(jl_f__structtype);
diff --git a/src/builtins.c b/src/builtins.c
index 7bccc23cd56df..be9fb45f2226a 100644
--- a/src/builtins.c
+++ b/src/builtins.c
@@ -1395,6 +1395,19 @@ JL_CALLABLE(jl_f_arrayset)
     return args[1];
 }
 
+JL_CALLABLE(jl_f_maybecopy)
+{
+    // maybecopy --- this builtin is never actually supposed to be executed
+    // instead, calls to it are analyzed and replaced with either a call to copy
+    // or directly replaced with the object itself that is the target of the maybecopy
+    // therefore, we just check that there is one argument and do a no-op
+    JL_NARGSV(maybecopy, 1);
+    JL_TYPECHK(maybecopy, array, args[0]);
+    jl_array_t *a = (jl_array_t*)args[0];
+    jl_array_t *na = jl_array_copy(a);
+    return (jl_value_t*)na;
+}
+
 // type definition ------------------------------------------------------------
 
 JL_CALLABLE(jl_f__structtype)
@@ -1877,6 +1890,7 @@ void jl_init_primitives(void) JL_GC_DISABLED
     add_builtin_func("_setsuper!", jl_f__setsuper);
     jl_builtin__typebody = add_builtin_func("_typebody!", jl_f__typebody);
     add_builtin_func("_equiv_typedef", jl_f__equiv_typedef);
+    jl_builtin_maybecopy = add_builtin_func("maybecopy", jl_f_maybecopy);
 
     // builtin types
     add_builtin("Any", (jl_value_t*)jl_any_type);
diff --git a/src/codegen.cpp b/src/codegen.cpp
index 127e4a5cf8adc..bca74ca6a5fe3 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -663,12 +663,13 @@ static const auto jl_newbits_func = new JuliaFunction{
 // `julia.typeof` does read memory, but it is effectively readnone before we lower
 // the allocation function. This is OK as long as we lower `julia.typeof` no later than
 // `julia.gc_alloc_obj`.
+// Updated to argmemonly due to C++ deconstructor style usage in jl_f_arrayfreeze / mutating_arrayfreeze
 static const auto jl_typeof_func = new JuliaFunction{
     "julia.typeof",
     [](LLVMContext &C) { return FunctionType::get(T_prjlvalue,
                 {T_prjlvalue}, false); },
     [](LLVMContext &C) { return AttributeList::get(C,
-            Attributes(C, {Attribute::ReadNone, Attribute::NoUnwind, Attribute::NoRecurse}),
+            Attributes(C, {Attribute::ArgMemOnly, Attribute::NoUnwind, Attribute::NoRecurse}),
             Attributes(C, {Attribute::NonNull}),
             None); },
 };
@@ -906,6 +907,7 @@ static const std::map<jl_fptr_args_t, JuliaFunction*> builtin_func_map = {
     { &jl_f_arrayfreeze,        new JuliaFunction{"jl_f_arrayfreeze", get_func_sig, get_func_attrs} },
     { &jl_f_arraythaw,          new JuliaFunction{"jl_f_arraythaw", get_func_sig, get_func_attrs} },
     { &jl_f_mutating_arrayfreeze,new JuliaFunction{"jl_f_mutating_arrayfreeze", get_func_sig, get_func_attrs} },
+    { &jl_f_maybecopy,          new JuliaFunction{"jl_f_maybecopy", get_func_sig, get_func_attrs} },
 };
 
 static const auto jl_new_opaque_closure_jlcall_func = new JuliaFunction{"jl_new_opaque_closure_jlcall", get_func_sig, get_func_attrs};
diff --git a/src/staticdata.c b/src/staticdata.c
index 53e83a0aa2df9..8be7d95f2e717 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -30,7 +30,7 @@ extern "C" {
 // TODO: put WeakRefs on the weak_refs list during deserialization
 // TODO: handle finalizers
 
-#define NUM_TAGS    155
+#define NUM_TAGS    156
 
 // An array of references that need to be restored from the sysimg
 // This is a manually constructed dual of the gvars array, which would be produced by codegen for Julia code, for C.
@@ -205,6 +205,7 @@ jl_value_t **const*const get_tags(void) {
         INSERT_TAG(jl_builtin_arrayfreeze);
         INSERT_TAG(jl_builtin_mutating_arrayfreeze);
         INSERT_TAG(jl_builtin_arraythaw);
+        INSERT_TAG(jl_builtin_maybecopy);
 
         // All optional tags must be placed at the end, so that we
         // don't accidentally have a `NULL` in the middle
@@ -255,7 +256,7 @@ static const jl_fptr_args_t id_to_fptrs[] = {
     &jl_f_applicable, &jl_f_invoke, &jl_f_sizeof, &jl_f__expr, &jl_f__typevar,
     &jl_f_ifelse, &jl_f__structtype, &jl_f__abstracttype, &jl_f__primitivetype,
     &jl_f__typebody, &jl_f__setsuper, &jl_f__equiv_typedef, &jl_f_opaque_closure_call,
-    &jl_f_arrayfreeze, &jl_f_mutating_arrayfreeze, &jl_f_arraythaw,
+    &jl_f_arrayfreeze, &jl_f_mutating_arrayfreeze, &jl_f_arraythaw, &jl_f_maybecopy,
     NULL
 };
 
diff --git a/test/compiler/immutablearray.jl b/test/compiler/immutablearray.jl
index 474e5dfc0f657..d4dc0d85da3a5 100644
--- a/test/compiler/immutablearray.jl
+++ b/test/compiler/immutablearray.jl
@@ -1,12 +1,460 @@
 using Base.Experimental: ImmutableArray
-function simple()
+using Test
+
+function test_allocate1()
     a = Vector{Float64}(undef, 5)
     for i = 1:5
         a[i] = i
     end
-    ImmutableArray(a)
+    Core.ImmutableArray(a)
 end
+
+function test_allocate2()
+    a = [1,2,3,4,5]
+    Core.ImmutableArray(a)
+end
+
+function test_allocate3()
+    a = Matrix{Float64}(undef, 5, 2)
+    for i = 1:5
+        for j = 1:2
+            a[i, j] = i + j
+        end
+    end
+    Core.ImmutableArray(a)
+end
+
+function test_allocate4()
+    a = Core.ImmutableArray{Float64}(undef, 5)
+end
+
+function test_broadcast1()
+    a = Core.ImmutableArray([1,2,3])
+    typeof(a .+ a) <: Core.ImmutableArray
+end
+
+function test_allocate5() # test that throwing boundserror doesn't escape
+    a = [1,2,3]
+    try
+        getindex(a, 4)
+    catch end
+    Core.ImmutableArray(a)
+end
+
+# function test_maybecopy1()
+#     a = Vector{Int64}(undef, 5)
+#     b = Core.maybecopy(a) # doesn't escape in this function - so a !=== b
+#     @test !(a === b)
+# end
+
+# function test_maybecopy2()
+#     a = Vector{Int64}(undef, 5)
+#     try
+#         a[6]
+#     catch e
+#         @test !(e.a === a)
+#     end
+# end
+
+# function test_maybecopy3()
+#     @noinline function escaper(arr)
+#         return arr
+#     end
+
+#     a = Vector{Int64}(undef, 5)
+#     escaper(a)
+#     b = Core.maybecopy(a)
+#     @test a === b # this time, it does escape, so we give back the actual object
+# end
+
+# function test_maybecopy4()
+#     @noinline function escaper(arr)
+#         return arr
+#     end
+
+#     a = Vector{Int64}(undef, 5)
+#     escaper(a)
+#     try
+#         a[6]
+#     catch e
+#         if isa(e, BoundsError)
+#             @test e.a === a # already escaped so we don't copy
+#         end
+#     end
+# end
+
+
+
+
 let
-    @allocated(simple())
-    @test @allocated(simple()) < 100
+    # warmup for @allocated
+    a,b,c,d,e = test_allocate1(), test_allocate2(), test_allocate3(), test_allocate4(), test_allocate5()
+
+    # these magic values are ~ what the mutable array version would allocate
+    @test @allocated(test_allocate1()) < 100
+    @test @allocated(test_allocate2()) < 100
+    @test @allocated(test_allocate3()) < 150
+    @test @allocated(test_allocate4()) < 100
+    @test @allocated(test_allocate5()) < 170
+    @test test_broadcast1() == true
+
+    # test_maybecopy1()
+    # test_maybecopy2()
+    # test_maybecopy3()
+    # test_maybecopy4()
 end
+
+
+# DiffEq Performance Tests
+
+# using DifferentialEquations
+# using StaticArrays
+
+# function _build_atsit5_caches(::Type{T}) where {T}
+
+#     cs = SVector{6, T}(0.161, 0.327, 0.9, 0.9800255409045097, 1.0, 1.0)
+
+#     as = SVector{21, T}(
+#         #=a21=# convert(T,0.161),
+#         #=a31=# convert(T,-0.008480655492356989),
+#         #=a32=# convert(T,0.335480655492357),
+#         #=a41=# convert(T,2.8971530571054935),
+#         #=a42=# convert(T,-6.359448489975075),
+#         #=a43=# convert(T,4.3622954328695815),
+#         #=a51=# convert(T,5.325864828439257),
+#         #=a52=# convert(T,-11.748883564062828),
+#         #=a53=# convert(T,7.4955393428898365),
+#         #=a54=# convert(T,-0.09249506636175525),
+#         #=a61=# convert(T,5.86145544294642),
+#         #=a62=# convert(T,-12.92096931784711),
+#         #=a63=# convert(T,8.159367898576159),
+#         #=a64=# convert(T,-0.071584973281401),
+#         #=a65=# convert(T,-0.028269050394068383),
+#         #=a71=# convert(T,0.09646076681806523),
+#         #=a72=# convert(T,0.01),
+#         #=a73=# convert(T,0.4798896504144996),
+#         #=a74=# convert(T,1.379008574103742),
+#         #=a75=# convert(T,-3.290069515436081),
+#         #=a76=# convert(T,2.324710524099774)
+#     )
+
+#     btildes = SVector{7,T}(
+#                 convert(T,-0.00178001105222577714),
+#                 convert(T,-0.0008164344596567469),
+#                 convert(T,0.007880878010261995),
+#                 convert(T,-0.1447110071732629),
+#                 convert(T,0.5823571654525552),
+#                 convert(T,-0.45808210592918697),
+#                 convert(T,0.015151515151515152)
+#     )
+#     rs = SVector{22, T}(
+#         #=r11=# convert(T,1.0),
+#         #=r12=# convert(T,-2.763706197274826),
+#         #=r13=# convert(T,2.9132554618219126),
+#         #=r14=# convert(T,-1.0530884977290216),
+#         #=r22=# convert(T,0.13169999999999998),
+#         #=r23=# convert(T,-0.2234),
+#         #=r24=# convert(T,0.1017),
+#         #=r32=# convert(T,3.9302962368947516),
+#         #=r33=# convert(T,-5.941033872131505),
+#         #=r34=# convert(T,2.490627285651253),
+#         #=r42=# convert(T,-12.411077166933676),
+#         #=r43=# convert(T,30.33818863028232),
+#         #=r44=# convert(T,-16.548102889244902),
+#         #=r52=# convert(T,37.50931341651104),
+#         #=r53=# convert(T,-88.1789048947664),
+#         #=r54=# convert(T,47.37952196281928),
+#         #=r62=# convert(T,-27.896526289197286),
+#         #=r63=# convert(T,65.09189467479366),
+#         #=r64=# convert(T,-34.87065786149661),
+#         #=r72=# convert(T,1.5),
+#         #=r73=# convert(T,-4),
+#         #=r74=# convert(T,2.5),
+#     )
+#     return cs, as, btildes, rs
+# end
+
+# function test_imarrays()
+#     function lorenz(u, p, t)
+#         a,b,c = u
+#         x,y,z = p
+#         dx_dt = x * (b - a)
+#         dy_dt = a*(y - c) - b
+#         dz_dt = a*b - z * c
+#         res = Vector{Float64}(undef, 3)
+#         res[1], res[2], res[3] = dx_dt, dy_dt, dz_dt
+#         Core.ImmutableArray(res)
+#     end
+
+#     _u0 = Core.ImmutableArray([1.0, 1.0, 1.0])
+#     _tspan = (0.0, 100.0)
+#     _p = (10.0, 28.0, 8.0/3.0)
+#     prob = ODEProblem(lorenz, _u0, _tspan, _p)
+
+#     u0 = prob.u0
+#     tspan = prob.tspan
+#     f = prob.f
+#     p = prob.p
+
+#     dt = 0.1f0
+#     saveat = nothing
+#     save_everystep = true
+#     abstol = 1f-6
+#     reltol = 1f-3
+
+#     t = tspan[1]
+#     tf = prob.tspan[2]
+
+#     beta1 = 7/50
+#     beta2 = 2/25
+#     qmax = 10.0
+#     qmin = 1/5
+#     gamma = 9/10
+#     qoldinit = 1e-4
+
+#     if saveat === nothing
+#         ts = Vector{eltype(dt)}(undef,1)
+#         ts[1] = prob.tspan[1]
+#         us = Vector{typeof(u0)}(undef,0)
+#         push!(us,recursivecopy(u0))
+#     else
+#         ts = saveat
+#         cur_t = 1
+#         us = MVector{length(ts),typeof(u0)}(undef)
+#         if prob.tspan[1] == ts[1]
+#             cur_t += 1
+#             us[1] = u0
+#         end
+#     end
+
+#     u = u0
+#     qold = 1e-4
+#     k7 = f(u, p, t)
+
+#     cs, as, btildes, rs = _build_atsit5_caches(eltype(u0))
+#     c1, c2, c3, c4, c5, c6 = cs
+#     a21, a31, a32, a41, a42, a43, a51, a52, a53, a54,
+#     a61, a62, a63, a64, a65, a71, a72, a73, a74, a75, a76 = as
+#     btilde1, btilde2, btilde3, btilde4, btilde5, btilde6, btilde7 = btildes
+
+#     # FSAL
+#     while t < tspan[2]
+#         uprev = u
+#         k1 = k7
+#         EEst = Inf
+
+#         while EEst > 1
+#             dt < 1e-14 && error("dt<dtmin")
+
+#             tmp = uprev+dt*a21*k1
+#             k2 = f(tmp, p, t+c1*dt)
+#             tmp = uprev+dt*(a31*k1+a32*k2)
+#             k3 = f(tmp, p, t+c2*dt)
+#             tmp = uprev+dt*(a41*k1+a42*k2+a43*k3)
+#             k4 = f(tmp, p, t+c3*dt)
+#             tmp = uprev+dt*(a51*k1+a52*k2+a53*k3+a54*k4)
+#             k5 = f(tmp, p, t+c4*dt)
+#             tmp = uprev+dt*(a61*k1+a62*k2+a63*k3+a64*k4+a65*k5)
+#             k6 = f(tmp, p, t+dt)
+#             u = uprev+dt*(a71*k1+a72*k2+a73*k3+a74*k4+a75*k5+a76*k6)
+#             k7 = f(u, p, t+dt)
+
+#             tmp = dt*(btilde1*k1+btilde2*k2+btilde3*k3+btilde4*k4+
+#                         btilde5*k5+btilde6*k6+btilde7*k7)
+#             tmp = tmp./(abstol.+max.(abs.(uprev),abs.(u))*reltol)
+#             EEst = DiffEqBase.ODE_DEFAULT_NORM(tmp, t)
+
+#             if iszero(EEst)
+#                 q = inv(qmax)
+#             else
+#                 @fastmath q11 = EEst^beta1
+#                 @fastmath q = q11/(qold^beta2)
+#             end
+
+#             if EEst > 1
+#                 dt = dt/min(inv(qmin),q11/gamma)
+#             else # EEst <= 1
+#                 @fastmath q = max(inv(qmax),min(inv(qmin),q/gamma))
+#                 qold = max(EEst,qoldinit)
+#                 dtold = dt
+#                 dt = dt/q #dtnew
+#                 dt = min(abs(dt),abs(tf-t-dtold))
+#                 told = t
+
+#                 if (tf - t - dtold) < 1e-14
+#                     t = tf
+#                 else
+#                     t += dtold
+#                 end
+
+#                 if saveat === nothing && save_everystep
+#                     push!(us,recursivecopy(u))
+#                     push!(ts,t)
+#                 else saveat !== nothing
+#                     while cur_t <= length(ts) && ts[cur_t] <= t
+#                         savet = ts[cur_t]
+#                         θ = (savet - told)/dtold
+#                         b1θ, b2θ, b3θ, b4θ, b5θ, b6θ, b7θ = bθs(rs, θ)
+#                         us[cur_t] = uprev + dtold*(
+#                             b1θ*k1 + b2θ*k2 + b3θ*k3 + b4θ*k4 + b5θ*k5 + b6θ*k6 + b7θ*k7)
+#                         cur_t += 1
+#                     end
+#                 end
+#             end
+#         end
+#     end
+
+#     if saveat === nothing && !save_everystep
+#         push!(us,u)
+#         push!(ts,t)
+#     end
+
+#     sol = DiffEqBase.build_solution(prob,Tsit5(),ts,us,calculate_error = false)
+
+#     DiffEqBase.has_analytic(prob.f) && DiffEqBase.calculate_solution_errors!(sol;timeseries_errors=true,dense_errors=false)
+
+#     sol
+# end
+
+# function test_marrays()
+#     function lorenz(u, p, t)
+#         a,b,c = u
+#         x,y,z = p
+#         dx_dt = x * (b - a)
+#         dy_dt = a*(y - c) - b
+#         dz_dt = a*b - z * c
+#         res = Vector{Float64}(undef, 3)
+#         res[1], res[2], res[3] = dx_dt, dy_dt, dz_dt
+#         res
+#     end
+
+#     _u0 = [1.0, 1.0, 1.0]
+#     _tspan = (0.0, 100.0)
+#     _p = (10.0, 28.0, 8.0/3.0)
+#     prob = ODEProblem(lorenz, _u0, _tspan, _p)
+
+#     u0 = prob.u0
+#     tspan = prob.tspan
+#     f = prob.f
+#     p = prob.p
+
+#     dt = 0.1f0
+#     saveat = nothing
+#     save_everystep = true
+#     abstol = 1f-6
+#     reltol = 1f-3
+
+#     t = tspan[1]
+#     tf = prob.tspan[2]
+
+#     beta1 = 7/50
+#     beta2 = 2/25
+#     qmax = 10.0
+#     qmin = 1/5
+#     gamma = 9/10
+#     qoldinit = 1e-4
+
+#     if saveat === nothing
+#         ts = Vector{eltype(dt)}(undef,1)
+#         ts[1] = prob.tspan[1]
+#         us = Vector{typeof(u0)}(undef,0)
+#         push!(us,recursivecopy(u0))
+#     else
+#         ts = saveat
+#         cur_t = 1
+#         us = MVector{length(ts),typeof(u0)}(undef)
+#         if prob.tspan[1] == ts[1]
+#             cur_t += 1
+#             us[1] = u0
+#         end
+#     end
+
+#     u = u0
+#     qold = 1e-4
+#     k7 = f(u, p, t)
+
+#     cs, as, btildes, rs = _build_atsit5_caches(eltype(u0))
+#     c1, c2, c3, c4, c5, c6 = cs
+#     a21, a31, a32, a41, a42, a43, a51, a52, a53, a54,
+#     a61, a62, a63, a64, a65, a71, a72, a73, a74, a75, a76 = as
+#     btilde1, btilde2, btilde3, btilde4, btilde5, btilde6, btilde7 = btildes
+
+#     # FSAL
+#     while t < tspan[2]
+#         uprev = u
+#         k1 = k7
+#         EEst = Inf
+
+#         while EEst > 1
+#             dt < 1e-14 && error("dt<dtmin")
+
+#             tmp = uprev+dt*a21*k1
+#             k2 = f(tmp, p, t+c1*dt)
+#             tmp = uprev+dt*(a31*k1+a32*k2)
+#             k3 = f(tmp, p, t+c2*dt)
+#             tmp = uprev+dt*(a41*k1+a42*k2+a43*k3)
+#             k4 = f(tmp, p, t+c3*dt)
+#             tmp = uprev+dt*(a51*k1+a52*k2+a53*k3+a54*k4)
+#             k5 = f(tmp, p, t+c4*dt)
+#             tmp = uprev+dt*(a61*k1+a62*k2+a63*k3+a64*k4+a65*k5)
+#             k6 = f(tmp, p, t+dt)
+#             u = uprev+dt*(a71*k1+a72*k2+a73*k3+a74*k4+a75*k5+a76*k6)
+#             k7 = f(u, p, t+dt)
+
+#             tmp = dt*(btilde1*k1+btilde2*k2+btilde3*k3+btilde4*k4+
+#                         btilde5*k5+btilde6*k6+btilde7*k7)
+#             tmp = tmp./(abstol.+max.(abs.(uprev),abs.(u))*reltol)
+#             EEst = DiffEqBase.ODE_DEFAULT_NORM(tmp, t)
+
+#             if iszero(EEst)
+#                 q = inv(qmax)
+#             else
+#                 @fastmath q11 = EEst^beta1
+#                 @fastmath q = q11/(qold^beta2)
+#             end
+
+#             if EEst > 1
+#                 dt = dt/min(inv(qmin),q11/gamma)
+#             else # EEst <= 1
+#                 @fastmath q = max(inv(qmax),min(inv(qmin),q/gamma))
+#                 qold = max(EEst,qoldinit)
+#                 dtold = dt
+#                 dt = dt/q #dtnew
+#                 dt = min(abs(dt),abs(tf-t-dtold))
+#                 told = t
+
+#                 if (tf - t - dtold) < 1e-14
+#                     t = tf
+#                 else
+#                     t += dtold
+#                 end
+
+#                 if saveat === nothing && save_everystep
+#                     push!(us,recursivecopy(u))
+#                     push!(ts,t)
+#                 else saveat !== nothing
+#                     while cur_t <= length(ts) && ts[cur_t] <= t
+#                         savet = ts[cur_t]
+#                         θ = (savet - told)/dtold
+#                         b1θ, b2θ, b3θ, b4θ, b5θ, b6θ, b7θ = bθs(rs, θ)
+#                         us[cur_t] = uprev + dtold*(
+#                             b1θ*k1 + b2θ*k2 + b3θ*k3 + b4θ*k4 + b5θ*k5 + b6θ*k6 + b7θ*k7)
+#                         cur_t += 1
+#                     end
+#                 end
+#             end
+#         end
+#     end
+
+#     if saveat === nothing && !save_everystep
+#         push!(us,u)
+#         push!(ts,t)
+#     end
+
+#     sol = DiffEqBase.build_solution(prob,Tsit5(),ts,us,calculate_error = false)
+
+#     DiffEqBase.has_analytic(prob.f) && DiffEqBase.calculate_solution_errors!(sol;timeseries_errors=true,dense_errors=false)
+
+#     sol
+# end
+