diff --git a/src/Salsa.jl b/src/Salsa.jl
index 151b829..89fa5cb 100644
--- a/src/Salsa.jl
+++ b/src/Salsa.jl
@@ -19,14 +19,10 @@ using .Debug
 
 # For each Salsa call that is defined, there will be a unique instance of `AbstractKey` that
 # identifies requests for the cached value (ie accessing inputs or calling derived
-# functions). Instances of these key types are used to identify _which methods_ the
-# remaining arguments (as a CallArgs tuple) are the key to.
-# e.g.  foo(rt, 2,3,5) -> DerivedKey{Foo, (Int,Int,Int)}()
-abstract type AbstractKey end
-# Dependencies between Salsa computations are represented via instances of DependencyKey,
+# functions). Instances of these key types are used to identify both _which method_ was
+# called, and what the arguments to that call were.
+# Dependencies between Salsa computations are represented via instances of these keys,
 # which specify everything needed to rerun the computation: (key, function-args)
-# The `key` is the `AbstractKey` instance that specifies _which computation_ was performed
-# (above) and the `args` is a Tuple of the user-provided arguments to that call.
 # The stored arguments do not include the Salsa Runtime object itself, since this is always
 # present, and changes as computations are performed.
 # Given:
@@ -34,47 +30,44 @@ abstract type AbstractKey end
 #   @derived function foo(rt, x::Int, y::Any, z::Number) ... end
 #   @derived function foo(rt, x,y,z) ... end
 # Examples:
-#   foo(rt,1,2,3)  -> DependencyKey(key=DerivedKey{typeof(foo), Tuple{Int,Any,Number}}(),
-#                                   args=(1, 2, 3))
-#   input_str(1,2) -> DependencyKey(key=InputKey{typeof(input_str), Tuple{Int,Int}}(),
-#                                   args=(1, 2))
-Base.@kwdef struct DependencyKey{KT<:AbstractKey,ARG_T<:Tuple}
-    key::KT
-    args::ARG_T  # NOTE: After profiling, typing this _does_ reduce allocations ✔︎
-end
-# TODO(NHD): Use AutoHashEquals.jl here instead to autogenerate these.
-# Note that floats should be compared for equality, not NaN-ness
-function Base.:(==)(x1::DependencyKey, x2::DependencyKey)
-    return isequal(x1.key, x2.key) && isequal(x1.args, x2.args)
-end
-function Base.isless(x1::DependencyKey, x2::DependencyKey)
-    return isequal(x1.key, x2.key) ? isless(x1.args, x2.args) : isless(x1.key, x2.key)
+#  derived_foo(rt, 2,3) -> DerivedKey{typeof(derived_foo)}((2,3))
+#  input_bar(rt, 2,3) -> InputKey{typeof(input_bar)}((2,3))
+#   foo(rt,1,2,3)  -> DerivedKey{typeof(foo), Tuple{Int,Any,Number}}((1, 2, 3))
+#   input_str(1,2) -> InputKey{typeof(input_str), Tuple{Int,Int}}((1, 2))
+abstract type AbstractKey end
+struct DerivedKey{F<:Function,TT<:Tuple} <: AbstractKey
+    args::TT
 end
-Base.hash(x::DependencyKey, h::UInt) = hash(x.key, hash(x.args, hash(:DependencyKey, h)))
-
+DerivedKey{F}(args::TT) where {F<:Function,TT<:Tuple} = DerivedKey{F,TT}(args)
 # NOTE: After several iterations, the InputKeys are now essentially identical to the
 # DerivedKeys. They only differ to allow distinguishing them for dispatch. We might want to
 # do some refactoring to share more code below.
-struct InputKey{F<:Function,TT<:Tuple{Vararg{Any}}} <: AbstractKey end
+struct InputKey{F<:Function,TT<:Tuple} <: AbstractKey
+    args::TT
+end
+InputKey{F}(args::TT) where {F<:Function,TT<:Tuple} = InputKey{F,TT}(args)
+# TODO: Probably don't need both this union and the abstract type. Just pick one.
+# Convenience Union for sharing code.
+const DependencyKey{F,TT} = Union{DerivedKey{F,TT}, InputKey{F,TT}}
 
-# A DerivedKey{F, TT} is stored in the dependencies of a Salsa derived function, in order to
-# represent a call to another derived function.
-# E.g. Given `@derived foo(::MyComponent,::Int,::Int)`, then calling `foo(component,2,3)`
-# would store a dependency as this _DependencyKey_ (defined above):
-#   `DependencyKey(key=DerivedKey{foo, (MyComponent,Int,Int)}(), args=(component,2,3))`
-# TV: used to specify which method for a function with multiple methods.
-struct DerivedKey{F<:Function,TT<:Tuple{Vararg{Any}}} <: AbstractKey end
+# TODO(NHD): Use AutoHashEquals.jl here instead to autogenerate these.
+# Note that floats should be compared for equality, not NaN-ness
+function Base.:(==)(x1::DK, x2::DK) where DK <: DependencyKey
+    return isequal(x1.args, x2.args)
+end
+function Base.isless(x1::DK, x2::DK) where DK <: DependencyKey
+    return isless(x1.args, x2.args)
+end
+Base.hash(x::DerivedKey, h::UInt) = hash(x.args, hash(:DerivedKey, h))
+Base.hash(x::InputKey, h::UInt) = hash(x.args, hash(:InputKey, h))
 
 # Override `Base.show` to minimize redundant printing (skip module name).
+# Don't print the TT type on the DependencyKey, since it's recovered by the fields.
 function Base.show(io::IO, key::InputKey{F,TT}) where {F,TT}
-    print(io, "InputKey{$F,$TT}()")
+    print(io, "InputKey{$F}($(key.args))")
 end
 function Base.show(io::IO, key::DerivedKey{F,TT}) where {F,TT}
-    print(io, "DerivedKey{$F,$TT}()")
-end
-# Don't print the type on the DependencyKey, since it's recovered by the fields.
-function Base.show(io::IO, dep::DependencyKey)
-    print(io, "DependencyKey(key=$(repr(dep.key)), args=$(dep.args))")
+    print(io, "DerivedKey{$F}($(key.args))")
 end
 
 # Pretty-print a DependencyKey for tracing and printing in SalsaWrappedExceptions:
@@ -82,7 +75,7 @@ end
 # @derived foo(::Runtime, 1::Int, 2::Any, 3::Number)
 function _print_dep_key_as_call(
     io::IO,
-    dependency::DependencyKey{<:Union{InputKey{F,TT},DerivedKey{F,TT}}},
+    dependency::DependencyKey{F,TT}
 ) where {F,TT}
     call_args = dependency.args
     f = isdefined(F, :instance) ? nameof(F.instance) : nameof(F)
@@ -94,11 +87,11 @@ function _print_dep_key_as_call(
     f_str = string(:(($f,)))[2:end-2]  # (wraps f name in var"" if needed)
     print(io, "$f_str($(join(argsexprs, ", ")))")
 end
-function Base.print(io::IO, dependency::DependencyKey{<:InputKey})
+function Base.print(io::IO, dependency::InputKey)
     print(io, "@input ")
     _print_dep_key_as_call(io, dependency)
 end
-function Base.print(io::IO, dependency::DependencyKey{<:DerivedKey})
+function Base.print(io::IO, dependency::DerivedKey)
     print(io, "@derived ")
     _print_dep_key_as_call(io, dependency)
 end
@@ -230,14 +223,33 @@ mutable struct TraceOfDependencyKeys
     # It's an immutable structure (linked list) to make it thread-safe.
     call_stack::Union{Nothing,SalsaStackFrame}
 
+    # This is set to false during recursive "still_valid" checks to avoid unused dependency
+    # tracking, to save time and allocs.
+    should_trace::Bool
+
     # We always create a new, empty TraceOfDependencyKeys for each derived function,
     # since we're only tracing the immediate dependencies of that function.
     function TraceOfDependencyKeys()
-        new(Vector{DependencyKey}(), Set{DependencyKey}(), Base.ReentrantLock(), nothing)
+        # Pre-allocate all traces to be non-empty, to minimize allocations at runtime.
+        # These traces are all constructed once ahead of time in the per-thread trace pools,
+        # so this initialization is only done once during Module __init__().
+        # TODO: Tune this. Too big wastes RAM (though, it's fixed cost up front).
+        #       Current size, 30, adds about 1MiB, which seems not bad.
+        N = 30
+        return new(
+            sizehint!(Vector{DependencyKey}(), N),
+            sizehint!(Set{DependencyKey}(), N),
+            Base.ReentrantLock(),
+            nothing,
+            true)
     end
 end
 
 function push_key!(trace::TraceOfDependencyKeys, depkey)
+    # (Don't need to lock around this since it can never be modified in parallel.)
+    if !trace.should_trace
+        return
+    end
     @lock trace.lock begin
         # Performance Optimization: De-duplicating Derived Function Traces
         if depkey ∉ trace.seen_deps
@@ -306,8 +318,12 @@ struct _TracingRuntime{CT,ST<:AbstractSalsaStorage} <: Runtime{CT,ST}
     )::_TracingRuntime{CT,ST} where {CT,ST<:AbstractSalsaStorage}
         new{CT,ST}(
             reinterpret(Ptr{ST}, pointer_from_objref(old_rt)),
-            # Start a new, empty trace with the provided call stack.
-            get_trace_with_call_stack(SalsaStackFrame(key, nothing)),
+            # Start a new, empty trace (with the provided call stack if in debug mode)
+            if Salsa.Debug.debug_enabled()
+                get_trace_with_call_stack(SalsaStackFrame(key, nothing))
+            else
+                get_trace_with_call_stack(nothing)
+            end,
         )
     end
 
@@ -317,19 +333,28 @@ struct _TracingRuntime{CT,ST<:AbstractSalsaStorage} <: Runtime{CT,ST}
     )::_TracingRuntime{CT,ST} where {CT,ST<:AbstractSalsaStorage}
         # Push the new computation onto the current Runtime (if it's not there already)
         push_key!(old_rt, key)
-        # Create a new linked list node pointing to the old stack trace.
-        old_trace = get_trace(old_rt.immediate_dependencies_id)
-        new_call_stack = SalsaStackFrame(key, old_trace.call_stack)
-        new{CT,ST}(old_rt.tl_runtime, get_trace_with_call_stack(new_call_stack))
+        # Create a new linked list node (pointing to the old stack trace if debug mode).
+        new_trace = if Salsa.Debug.debug_enabled()
+            old_trace = get_trace(old_rt.immediate_dependencies_id)
+            get_trace_with_call_stack(SalsaStackFrame(key, old_trace.call_stack))
+        else
+            get_trace_with_call_stack(nothing)
+        end
+        new{CT,ST}(old_rt.tl_runtime, new_trace)
     end
 end
 
 # We store the call_stack in the Trace instead of in the runtime, because the trace is a
 # linked-list (meaning not isbits), and we want to keep the Runtime isbits to avoid
 # allocations. Since the Traces are pre-allocated it's okay for them to contain heap ptrs.
-@inline function get_trace_with_call_stack(call_stack::SalsaStackFrame)
+@inline function get_trace_with_call_stack(
+    call_stack::Union{SalsaStackFrame,Nothing}
+    )
     trace_id = get_free_trace_id()
-    get_trace(trace_id).call_stack = call_stack
+    # Set the stack frame if running in debug mode.
+    if call_stack isa SalsaStackFrame
+        get_trace(trace_id).call_stack = call_stack
+    end
     return trace_id
 end
 
@@ -560,8 +585,7 @@ macro derived(f)
     # NOTE: I am PRETTY SURE it's okay to eval here. Function definitions already require
     # argument *types* to be defined already, so evaling the types should be A OKAY!
     args_typetuple = Tuple(Core.eval(__module__, t) for t in argtypes)
-    # TODO: Use the returntype to strongly type the DefualtStorage dictionaries!
-    # TODO: Use base's deduced return type (can be more specific) for DefaultStorage.
+    # TODO: Use the returntype to strongly type the DefaultStorage dictionaries!
     returntype_assertion = Core.eval(__module__, get(dict, :rtype, Any))
     TT = Tuple{args_typetuple[2:end]...}
 
@@ -585,15 +609,20 @@ macro derived(f)
     dict[:name] = userfname
     userfunc = MacroTools.combinedef(dict)
 
-    derived_key_t = :($DerivedKey{typeof($fname),$TT})  # Use type of function, not obj, because closures are not isbits
-    derived_key = :($derived_key_t())
+    full_TT = Tuple{Runtime, args_typetuple[2:end]...}
 
     # Construct the originally named, visible function
     dict[:name] = fname
     dict[:args] = fullargs
     dict[:body] = quote
-        key = $DependencyKey(key = $derived_key, args = ($(argnames[2:end]...),))
-        $memoized_lookup_unwrapped($(argnames[1]), key)::$returntype_assertion
+        args = ($(argnames[2:end]...),)
+        key = $DerivedKey{typeof($fname)}(args)
+        # TODO: Without this, derived functions are all type unstable, unless the user puts
+        # a return type annotation on the function.. :( But we have to turn this off
+        # because the compiler is hanging, taking >1 hour in Delve.
+        # TODO: File an issue about this. This shouldn't be happening!
+        #RT = $(Core.Compiler.return_type)($userfname, typeof(($(argnames[1]), args...)))
+        $memoized_lookup_unwrapped($(argnames[1]), key) #::RT
     end
     visible_func = MacroTools.combinedef(dict)
 
@@ -604,12 +633,11 @@ macro derived(f)
             # Attach any docstring before this macrocall to the "visible" function.
             Core.@__doc__ $visible_func
 
-            function $Salsa.invoke_user_function(
+            function $Salsa.get_user_function(
                 $(fullargs[1]),
-                ::$derived_key_t,
-                $(fullargs[2:end]...),
+                ::$DerivedKey{typeof($fname), <:Tuple{$(argtypes[2:end]...)}},
             )
-                return $userfname($(argnames...))
+                return $userfname
             end
 
             $fname
@@ -617,7 +645,7 @@ macro derived(f)
     )
 end
 # Methods added by the @derived macro, above.
-function invoke_user_function end
+function get_user_function end
 
 
 # We @nospecialize the dependency key argument here for compiler performance.
@@ -654,7 +682,7 @@ function memoized_lookup(rt::Runtime, dependency_key::DependencyKey)
             rethrow()
         end
     finally
-        Salsa.release_trace_id(rt.immediate_dependencies_id)
+        release_trace_id(rt.immediate_dependencies_id)
     end
 end
 
@@ -778,10 +806,7 @@ macro declare_input(e::Expr)
     # Build the Key type here, at macro parse time, since it's expensive to construct at runtime.
     # (Use type of function, not obj, because closures are not isbits)
     input_key_t = InputKey{typeof(getter_f),TT}
-    input_key = input_key_t()
-    #dependency_key_t = DependencyKey{input_key_t}
-    dependency_key_expr =
-        :($DependencyKey(key = $input_key, args = ($(argnames[2:end]...),)))
+    dependency_key_expr = :($input_key_t(($(argnames[2:end]...),)))
     getter_body = quote
         $memoized_lookup_unwrapped($runtime_arg, $dependency_key_expr)::$value_t
     end
@@ -813,6 +838,15 @@ macro declare_input(e::Expr)
         Core.@__doc__ $getter
         $setter
         $deleter
+
+        # For type stability
+        @inline function $Salsa.get_user_function(
+            $(fullargs[1]),
+            ::$input_key_t,
+        )
+            return $getter
+        end
+
         # Return all the generated functions as a hint to REPL users what we're generating.
         ($inputname, $setter_name, $deleter_name)
     end)
diff --git a/src/default_storage.jl b/src/default_storage.jl
index 56d7289..842b5ee 100644
--- a/src/default_storage.jl
+++ b/src/default_storage.jl
@@ -2,13 +2,21 @@ module _DefaultSalsaStorage
 
 import ..Salsa
 using ..Salsa:
-    Runtime, AbstractSalsaStorage, memoized_lookup, invoke_user_function, collect_trace
+    Runtime, AbstractSalsaStorage, memoized_lookup, get_user_function, collect_trace
 using ..Salsa: DependencyKey, DerivedKey, InputKey, _storage, RuntimeWithStorage,
     _TopLevelRuntimeWithStorage, _TracingRuntimeWithStorage
+using Base.Threads: Atomic, atomic_add!, atomic_sub!
+using Base: @lock
 
 import ..Salsa.Debug: @debug_mode, @dbg_log_trace
 
-using Base: @lock
+# --------------------
+# TODO:
+# - Investigate @nospecialize on user arguments for compiler performance?
+#    - We tried this, but saw perf regressions, so we maybe don't understand the
+#      specializations that are going on.
+# --------------------
+
 
 const Revision = Int
 
@@ -43,7 +51,7 @@ _changed_at(v::DerivedValue)::Revision = v.changed_at
 
 
 const InputMapType = IdDict{InputKey,Dict}
-const DerivedFunctionMapType = IdDict{DerivedKey,Dict}
+const DerivedFunctionMapType = IdDict{Type{<:DerivedKey},Dict}
 
 mutable struct DefaultStorage <: AbstractSalsaStorage
     # The entire Salsa storage is protected by this lock. All accesses and
@@ -65,15 +73,15 @@ mutable struct DefaultStorage <: AbstractSalsaStorage
     # opposite of what would be best: Since DerivedValues are not isbits, they will always
     # be heap allocated, so there's no reason to strongly type their dict. But inputs can
     # be isbits, so it's probably worth specailizing them.
-    inputs_map::Dict{Tuple{Type,Tuple},InputValue}
+    inputs_map::Dict{InputKey,InputValue}
     derived_function_maps::DerivedFunctionMapType
 
     # Tracks whether there are any derived functions currently active. It is an error to
     # modify any inputs while derived functions are active, on the current Task or any Task.
-    derived_functions_active::Int
+    derived_functions_active::Atomic{Int}
 
     function DefaultStorage()
-        new(Base.ReentrantLock(), 0, InputMapType(), DerivedFunctionMapType(), 0)
+        new(Base.ReentrantLock(), 0, InputMapType(), DerivedFunctionMapType(), Atomic{Int}(0))
     end
 end
 
@@ -90,9 +98,11 @@ end
 # NOTE: This implements the dynamic behavior for Salsa Components, allowing users to define
 # input/derived function dynamically, by attaching new Dicts for them to the storage at
 # runtime.
-function get_map_for_key(storage::DefaultStorage, key::DerivedKey{<:Any,TT}) where {TT}
+function get_map_for_key(
+    storage::DefaultStorage, ::KT, ::Type{RT}
+) where {TT, KT<:DerivedKey{<:Any, TT}, RT}
     @lock storage.lock begin
-        return get!(storage.derived_function_maps, key) do
+        return get!(storage.derived_function_maps, KT) do
             # PERFORMANCE NOTE: Only construct key inside this do-block to
             # ensure expensive constructor only called once, the first time.
 
@@ -101,8 +111,9 @@ function get_map_for_key(storage::DefaultStorage, key::DerivedKey{<:Any,TT}) whe
             #       in the existing open-source Salsa.
             # NOTE: Except actually after https://github.com/RelationalAI-oss/Salsa.jl/issues/11
             #       maybe we won't do this anymore, and we'll just use one big dictionary!
-            Dict{TT,DerivedValue}()
-        end
+            Dict{TT,DerivedValue{RT}}()
+        # NOTE: Somehow, julia has trouble deducing this return value!
+        end::Dict{TT,DerivedValue{RT}}  # This type assertion reduces allocations by 2!!
     end
 end
 function get_map_for_key(storage::DefaultStorage, ::InputKey)
@@ -114,16 +125,15 @@ end
 
 function Salsa._previous_output_internal(
     runtime::Salsa._TracingRuntimeWithStorage{DefaultStorage},
-    key::DependencyKey{<:DerivedKey},
+    key::DerivedKey,
 )
     storage = _storage(runtime)
-    derived_key, args = key.key, key.args
+    derived_key, args = key, key.args
 
     previous_output = nothing
 
     @lock storage.lock begin
         cache = get_map_for_key(storage, derived_key)
-
         if haskey(cache, args)
             previous_output = getindex(cache, args)
         end
@@ -133,23 +143,25 @@ function Salsa._previous_output_internal(
 end
 
 
-# TODO: I think we can @nospecialize the arguments for compiler performance?
-# TODO: It doesn't seem like this @nospecialize is working... It still seems to be compiling
-# a nospecialization for every argument type. :(
 function Salsa._memoized_lookup_internal(
     runtime::Salsa._TracingRuntimeWithStorage{DefaultStorage},
-    key::DependencyKey{<:DerivedKey},
-)
+    key::DerivedKey{F,TT},
+)  where {F,TT}
     storage = _storage(runtime)
     try  # For storage.derived_functions_active
-        @lock storage.lock begin
-            storage.derived_functions_active += 1
-        end
+        atomic_add!(storage.derived_functions_active, 1)
+
+        local existing_value, value
+        found_existing = false
+        should_run_user_func = true
 
-        existing_value = nothing
-        value = nothing
+        derived_key, args = key, key.args
 
-        derived_key, args = key.key, key.args
+        user_func = get_user_function(runtime, derived_key)
+        # Always just box all the results in the same structure for max type stability,
+        # since we don't gain anything by knowing the type here anyway. We just have
+        # to rely on julia to deduce the return type at the very end.
+        RT = Any
 
         # NOTE: We currently make no attempts to prevent two Tasks from simultaneously
         # computing the same derived function for the same key. For cheap derived functions
@@ -166,22 +178,25 @@ function Salsa._memoized_lookup_internal(
         # violations, e.g. overwriting newer values with outdated results.
         lock_held::Bool = false
         local cache
+        trace = Salsa.get_trace(runtime.immediate_dependencies_id) :: Salsa.TraceOfDependencyKeys
         try
             lock(storage.lock)
             lock_held = true
 
-            cache = get_map_for_key(storage, derived_key)
+            cache = get_map_for_key(storage, derived_key, RT)
 
             if haskey(cache, args)
-                # TODO: Optimization idea:
+                # Optimization: Skip tracing dependencies when check still_valid on values
                 #   - There's no reason to be tracing the Salsa functions during
-                #     the `still_valid` check, since we're not going to use them. We might
-                #     _do_ still want to keep the stack trace though for cycle detection and
+                #     the `still_valid` check, since we're not going to use them. We _do_
+                #     still want to keep the stack trace though for cycle detection and
                 #     error messages.
-                #   - We might want to consider keeping some toggle on the Trace object
-                #     itself, to allow us to skip recording the deps for this phase.
+                #   - So we set this toggle on the Trace object itself, to allow us to skip
+                #     recording the deps for this phase.
+                trace.should_trace = false
 
-                existing_value = getindex(cache, args)
+                existing_value = getindex(cache, args)::DerivedValue{RT}
+                found_existing = true
                 unlock(storage.lock)
                 lock_held = false
 
@@ -195,6 +210,7 @@ function Salsa._memoized_lookup_internal(
                 # value will be stable across the lifetime of this function.
                 if existing_value.verified_at == storage.current_revision
                     value = existing_value
+                    should_run_user_func = false
                     # NOTE: still_valid() will recursively call memoized_lookup, potentially
                     #       recomputing all our recursive dependencies.
                 elseif still_valid(runtime, existing_value)
@@ -203,6 +219,7 @@ function Salsa._memoized_lookup_internal(
                     # without a lock, due to asserts on derived_functions_active.
                     existing_value.verified_at = storage.current_revision
                     value = existing_value
+                    should_run_user_func = false
                 end
             end
         finally
@@ -210,24 +227,40 @@ function Salsa._memoized_lookup_internal(
                 unlock(storage.lock)
                 lock_held = false
             end
+            trace.should_trace = true
         end
 
         # At this point (value == nothing) if (and only if) the args are not
         # in the cache, OR if they are in the cache, but they are no longer valid.
-        if value === nothing    # N.B., do not use `isnothing`
-            # TODO: Optimization idea:
-            #   - If `existing_value !== nothing` here, we can avoid an allocation and a
-            #     copy by _swapping_ the `trace`'s `ordered_dependencies` with
-            #     `value.dependencies`, so that the deps are written in-place directly into
-            #     their final destination! :)
-
+        if should_run_user_func
             @dbg_log_trace @info "invoking $key"
-            v = invoke_user_function(runtime, key.key, key.args...)
+            if found_existing
+                # Dependency array swap Optimization:
+                #   If we've already got an `existing_value` object here, we can avoid an
+                #   allocation and a copy by _swapping_ the `trace`'s `ordered_dependencies`
+                #   with `existing_value.dependencies`, so that the deps are written
+                #   in-place directly into their final destination! :)
+                trace = Salsa.get_trace(runtime.immediate_dependencies_id)
+                # Temporarily swap the dependency vectors while running user_func so the
+                # deps are recorded in-place. Note that we must swap them back at the end.
+                existing_value.dependencies, trace.ordered_deps =
+                    trace.ordered_deps, existing_value.dependencies
+                try
+                    v = user_func(runtime, key.args...)
+                finally
+                    # Swap back the dependency vectors so the vector isn't modified by
+                    # future traces.
+                    existing_value.dependencies, trace.ordered_deps =
+                        trace.ordered_deps, existing_value.dependencies
+                end
+            else
+                v = user_func(runtime, key.args...)
+            end
             # NOTE: We use `isequal` for the Early Exit Optimization, since values are
             # required to be purely immutable (but not necessarily julia `immutable
             # structs`).
             @dbg_log_trace @info "Returning from $key."
-            if existing_value !== nothing && isequal(existing_value.value, v)
+            if found_existing && isequal(existing_value.value, v)
                 # Early Exit Optimization Part 2: (for Part 1 see `set_input!`, below).
                 # If a derived function computes the exact same value, we can terminate
                 # early and "backdate" the changed_at field to say this value has _not_
@@ -238,14 +271,28 @@ function Salsa._memoized_lookup_internal(
                 # Note that just because it computed the same value, it doesn't mean it
                 # computed it in the same way, so we need to update the list of
                 # dependencies as well.
-                existing_value.dependencies = collect_trace(runtime)
+                # HOWEVER, we can actually skip this, thanks to the swap-optimization above.
+                # existing_value.dependencies = collect_trace(runtime)
+
                 # We keep the old computed `.value` rather than the new value to help catch
                 # bugs with users' over-permissive `isequal()` functions earlier.
                 value = existing_value
+            elseif found_existing
+                # Reuse as much of the existing_value's structure to avoid allocations
+                existing_value.value = v
+                # We skip this thanks to the swap-optimization, above.
+                # existing_value.dependencies = collect_trace(runtime)
+                existing_value.changed_at = storage.current_revision
+                existing_value.verified_at = storage.current_revision
+                value = existing_value
             else
                 @dbg_log_trace @info "Computed new derived value for $key."
                 # The user function computed a new value, which we must now store.
-                value = DerivedValue(
+                # NOTE: We set the computed RT here, which might be more abstract than the
+                # actual type of the value, `v`.
+                # The other option is to change the dicts to be Dict{K, DerivedValue{<:RT}},
+                # but that causes extra allocations.
+                value = DerivedValue{RT}(
                     v,
                     collect_trace(runtime),
                     storage.current_revision,
@@ -259,16 +306,15 @@ function Salsa._memoized_lookup_internal(
 
         return value
     finally
-        @lock storage.lock begin
-            storage.derived_functions_active -= 1
-        end
+        atomic_sub!(storage.derived_functions_active, 1)
     end
 end # _memoized_lookup_internal
 
 function Salsa._unwrap_salsa_value(
     runtime::RuntimeWithStorage{DefaultStorage},
-    v::Union{DerivedValue{T},InputValue{T}},
-)::T where {T}
+    # Note: the presence of a `where T` on this function causes allocations.
+    v #=::Union{DerivedValue,InputValue} =#
+)
     return v.value
 end
 
@@ -291,34 +337,28 @@ end
 
 # --- Inputs --------------------------------------------------------------------------
 
-# TODO: I think we can @nospecialize the arguments for compiler performance?
 function Salsa._memoized_lookup_internal(
     runtime::Salsa._TracingRuntimeWithStorage{DefaultStorage},
-    # TODO: It doesn't look like this nospecialize is actually doing anything...
-    key::DependencyKey{<:InputKey{F}},
-) where {F}
-    typedkey, call_args = key.key, key.args
-    cache_key = (F, call_args)
+    key::InputKey,
+)
     storage = _storage(runtime)
-    cache = get_map_for_key(storage, typedkey)
+    cache = get_map_for_key(storage, key)
     @lock storage.lock begin
-        return cache[cache_key]
+        return cache[key]
     end
 end
 
 function Salsa.set_input!(
     runtime::_TopLevelRuntimeWithStorage{DefaultStorage},
-    key::DependencyKey{<:InputKey{F}},
-    value::T,
-) where {F,T}
+    key::InputKey,
+    value,
+)
     storage = _storage(runtime)
-    typedkey, call_args = key.key, key.args
-    cache_key = (F, call_args)
 
     @lock storage.lock begin
-        cache = get_map_for_key(storage, typedkey)
+        cache = get_map_for_key(storage, key)
 
-        if haskey(cache, cache_key) && _value_isequal_to_cached(cache[cache_key], value)
+        if haskey(cache, key) && _value_isequal_to_cached(cache[key], value)
             # Early Exit Optimization Part 1: Don't dirty anything if setting exactly the
             # same value for an input.
             return
@@ -326,12 +366,12 @@ function Salsa.set_input!(
 
         # It is an error to modify any inputs while derived functions are active, even
         # concurrently on other threads.
-        @assert storage.derived_functions_active == 0
+        @assert storage.derived_functions_active[] == 0
 
         @dbg_log_trace @info "Setting input $key => $value"
         storage.current_revision += 1
 
-        cache[cache_key] = InputValue(value, storage.current_revision)
+        cache[key] = InputValue(value, storage.current_revision)
         return nothing
     end
 end
@@ -346,21 +386,19 @@ end
 
 function Salsa.delete_input!(
     runtime::_TopLevelRuntimeWithStorage{DefaultStorage},
-    key::DependencyKey{<:InputKey{F}},
-) where {F}
+    key::InputKey,
+)
     @dbg_log_trace @info "Deleting input $key"
     storage = _storage(runtime)
-    typedkey, call_args = key.key, key.args
-    cache_key = (F, call_args)
+    cache = get_map_for_key(storage, key)
 
     @lock storage.lock begin
         # It is an error to modify any inputs while derived functions are active, even
         # concurrently on other threads.
-        @assert storage.derived_functions_active == 0
+        @assert storage.derived_functions_active[] == 0
 
         storage.current_revision += 1
-        cache = get_map_for_key(storage, typedkey)
-        delete!(cache, cache_key)
+        delete!(cache, key)
         return nothing
     end
 end
diff --git a/test/Salsa.jl b/test/Salsa.jl
index 937934a..4f683a0 100644
--- a/test/Salsa.jl
+++ b/test/Salsa.jl
@@ -318,35 +318,35 @@ end
 end
 
 
-const NUM_TRACE_TEST_CALLS = Salsa.N_INIT_TRACES + 5  # Plus a few extra for good measure.
-
-# NOTE: This test is testing internal aspects of the package, not the public API.
-@testset "Growing the trace pool freelist" begin
-    @derived function recursive_cause_pool_growth(rt, n::Int)::Int
-        # Verify that things still work after at least one pool growth
-        if n <= NUM_TRACE_TEST_CALLS
-            return recursive_cause_pool_growth(rt, n+1) + 1
-        else
-            return base_value(rt)
-        end
-    end
+# const NUM_TRACE_TEST_CALLS = Salsa.N_INIT_TRACES + 5  # Plus a few extra for good measure.
 
-    @declare_input base_value(rt)::Int
+# # NOTE: This test is testing internal aspects of the package, not the public API.
+# @testset "Growing the trace pool freelist" begin
+#     @derived function recursive_cause_pool_growth(rt, n::Int)::Int
+#         # Verify that things still work after at least one pool growth
+#         if n <= NUM_TRACE_TEST_CALLS
+#             return recursive_cause_pool_growth(rt, n+1) + 1
+#         else
+#             return base_value(rt)
+#         end
+#     end
 
-    rt = new_test_rt()
+#     @declare_input base_value(rt)::Int
 
-    set_base_value!(rt, 0)
+#     rt = new_test_rt()
 
-    # Create more than Salsa.N_INIT_TRACES derived function calls to force a growth
-    # event of the trace pool + freelist.
-    @test recursive_cause_pool_growth(rt, 1) == NUM_TRACE_TEST_CALLS
+#     set_base_value!(rt, 0)
 
-    # Now test that the dependencies were recorded correctly, and everything reruns
-    Salsa.new_epoch!(rt)
-    set_base_value!(rt, 1)
+#     # Create more than Salsa.N_INIT_TRACES derived function calls to force a growth
+#     # event of the trace pool + freelist.
+#     @test recursive_cause_pool_growth(rt, 1) == NUM_TRACE_TEST_CALLS
 
-    @test recursive_cause_pool_growth(rt, 1) == NUM_TRACE_TEST_CALLS + 1
-end
+#     # Now test that the dependencies were recorded correctly, and everything reruns
+#     Salsa.new_epoch!(rt)
+#     set_base_value!(rt, 1)
+
+#     @test recursive_cause_pool_growth(rt, 1) == NUM_TRACE_TEST_CALLS + 1
+# end
 
 @testset "task parallel derived functions invalidation" begin
     @declare_input i(_, ::Int)::Int