From c033de185b1f9638ea8bba33778541cdc15f84cb Mon Sep 17 00:00:00 2001 From: Shuhei Kadowaki Date: Wed, 1 Nov 2023 21:54:01 +0900 Subject: [PATCH] carefully cache freshly-inferred edge for call-site inlining Currently call-site inlining fails on freshly-inferred edge if its source isn't inlineable. This happens because such sources are exclusively cached globally. For successful call-site inlining, it's necessary to cache these locally also, ensuring the inliner can access them later. To this end, the type of the `cache_mode` field of `InferenceState` has been switched to `UInt8` from `Symbol`. This change allows it to represent multiple caching strategies. A new caching mode can be introduced, e.g. `VOLATILE_CACHE_MODE`, in the future. --- base/compiler/inferencestate.jl | 45 ++++++++++++++++++++++++------ base/compiler/typeinfer.jl | 18 +++++++----- stdlib/REPL/src/REPLCompletions.jl | 4 +-- test/compiler/inline.jl | 12 ++++++++ 4 files changed, 62 insertions(+), 17 deletions(-) diff --git a/base/compiler/inferencestate.jl b/base/compiler/inferencestate.jl index 96543308e8bd75..2552b7ac10ca9e 100644 --- a/base/compiler/inferencestate.jl +++ b/base/compiler/inferencestate.jl @@ -198,6 +198,10 @@ to enable flow-sensitive analysis. """ const VarTable = Vector{VarState} +const CACHE_MODE_NULL = 0x00 +const CACHE_MODE_GLOBAL = 0x01 << 0 +const CACHE_MODE_LOCAL = 0x01 << 1 + mutable struct InferenceState #= information about this method instance =# linfo::MethodInstance @@ -240,7 +244,7 @@ mutable struct InferenceState # Whether to restrict inference of abstract call sites to avoid excessive work # Set by default for toplevel frame. restrict_abstract_call_sites::Bool - cache_mode::Symbol # TODO move this to InferenceResult? + cache_mode::UInt8 # TODO move this to InferenceResult? insert_coverage::Bool # The interpreter that created this inference state. 
Not looked at by @@ -248,7 +252,7 @@ mutable struct InferenceState interp::AbstractInterpreter # src is assumed to be a newly-allocated CodeInfo, that can be modified in-place to contain intermediate results - function InferenceState(result::InferenceResult, src::CodeInfo, cache_mode::Symbol, + function InferenceState(result::InferenceResult, src::CodeInfo, cache_mode::Union{UInt8,Symbol}, interp::AbstractInterpreter) linfo = result.linfo world = get_world_counter(interp) @@ -303,11 +307,21 @@ mutable struct InferenceState end restrict_abstract_call_sites = isa(def, Module) - @assert cache_mode === :no || cache_mode === :local || cache_mode === :global + if cache_mode isa Symbol + if cache_mode === :global + cache_mode = CACHE_MODE_GLOBAL + elseif cache_mode === :local + cache_mode = CACHE_MODE_LOCAL + elseif cache_mode === :no + cache_mode = CACHE_MODE_NULL + else + error("unexpected `cache_mode` is given") + end + end # some more setups InferenceParams(interp).unoptimize_throw_blocks && mark_throw_blocks!(src, handler_at) - cache_mode === :local && push!(get_inference_cache(interp), result) + !iszero(cache_mode & CACHE_MODE_LOCAL) && push!(get_inference_cache(interp), result) return new( linfo, world, mod, sptypes, slottypes, src, cfg, method_info, @@ -446,13 +460,28 @@ function should_insert_coverage(mod::Module, src::CodeInfo) return false end -function InferenceState(result::InferenceResult, cache::Symbol, interp::AbstractInterpreter) +function InferenceState(result::InferenceResult, cache_mode::UInt8, interp::AbstractInterpreter) # prepare an InferenceState object for inferring lambda world = get_world_counter(interp) src = retrieve_code_info(result.linfo, world) src === nothing && return nothing validate_code_in_debug_mode(result.linfo, src, "lowered") - return InferenceState(result, src, cache, interp) + return InferenceState(result, src, cache_mode, interp) +end +InferenceState(result::InferenceResult, cache_mode::Symbol, interp::AbstractInterpreter) = + 
InferenceState(result, convert_cache_mode(cache_mode), interp) +InferenceState(result::InferenceResult, src::CodeInfo, cache_mode::Symbol, interp::AbstractInterpreter) = + InferenceState(result, src, convert_cache_mode(cache_mode), interp) + +function convert_cache_mode(cache_mode::Symbol) + if cache_mode === :global + return CACHE_MODE_GLOBAL + elseif cache_mode === :local + return CACHE_MODE_LOCAL + elseif cache_mode === :no + return CACHE_MODE_NULL + end + error("unexpected `cache_mode` is given") end """ @@ -666,7 +695,7 @@ end function print_callstack(sv::InferenceState) while sv !== nothing print(sv.linfo) - sv.cache_mode === :global || print(" [uncached]") + is_cached(sv) || print(" [uncached]") println() for cycle in sv.callers_in_cycle print(' ', cycle.linfo) @@ -764,7 +793,7 @@ frame_parent(sv::IRInterpretationState) = sv.parent::Union{Nothing,AbsIntState} is_constproped(sv::InferenceState) = any(sv.result.overridden_by_const) is_constproped(::IRInterpretationState) = true -is_cached(sv::InferenceState) = sv.cache_mode === :global +is_cached(sv::InferenceState) = !iszero(sv.cache_mode & CACHE_MODE_GLOBAL) is_cached(::IRInterpretationState) = false method_info(sv::InferenceState) = sv.method_info diff --git a/base/compiler/typeinfer.jl b/base/compiler/typeinfer.jl index cab02a39c87c1b..a9140a965b5c7c 100644 --- a/base/compiler/typeinfer.jl +++ b/base/compiler/typeinfer.jl @@ -267,12 +267,12 @@ function _typeinf(interp::AbstractInterpreter, frame::InferenceState) end for caller in frames finish!(caller.interp, caller) - if caller.cache_mode === :global + if is_cached(caller) cache_result!(caller.interp, caller.result) end # Drop result.src here since otherwise it can waste memory. - # N.B. If the `cache_mode === :local`, the inliner may request to use it later. - if caller.cache_mode !== :local + # N.B. If cached locally, the inliner may request to use it later. 
+ if iszero(caller.cache_mode & CACHE_MODE_LOCAL) caller.result.src = nothing end end @@ -546,7 +546,7 @@ function finish(me::InferenceState, interp::AbstractInterpreter) # a parent may be cached still, but not this intermediate work: # we can throw everything else away now me.result.src = nothing - me.cache_mode = :no + me.cache_mode = CACHE_MODE_NULL set_inlineable!(me.src, false) unlock_mi_inference(interp, me.linfo) elseif limited_src @@ -558,7 +558,7 @@ function finish(me::InferenceState, interp::AbstractInterpreter) # annotate fulltree with type information, # either because we are the outermost code, or we might use this later type_annotate!(interp, me) - doopt = (me.cache_mode !== :no || me.parent !== nothing) + doopt = (me.cache_mode != CACHE_MODE_NULL || me.parent !== nothing) # Disable the optimizer if we've already determined that there's nothing for # it to do. if may_discard_trees(interp) && is_result_constabi_eligible(me.result) @@ -817,15 +817,19 @@ function typeinf_edge(interp::AbstractInterpreter, method::Method, @nospecialize # we already inferred this edge before and decided to discard the inferred code, # nevertheless we re-infer it here again and keep it around in the local cache # since the inliner will request to use it later - cache_mode = :local + cache_mode = CACHE_MODE_LOCAL else rt = cached_return_type(code) effects = ipo_effects(code) update_valid_age!(caller, WorldRange(min_world(code), max_world(code))) return EdgeCallResult(rt, mi, effects) end + elseif is_stmt_inline(get_curr_ssaflag(caller)) + # if this freshly-inferred edge is going to be inlined, we cache it locally too so that the inliner + # can see it later even in the case when the inferred source is discarded from the global cache + cache_mode = CACHE_MODE_GLOBAL | CACHE_MODE_LOCAL else - cache_mode = :global # cache edge targets by default + cache_mode = CACHE_MODE_GLOBAL # cache edge targets globally by default end if ccall(:jl_get_module_infer, Cint, (Any,), method.module) == 0 && 
!generating_output(#=incremental=#false) add_remark!(interp, caller, "Inference is disabled for the target module") diff --git a/stdlib/REPL/src/REPLCompletions.jl b/stdlib/REPL/src/REPLCompletions.jl index 2c2646636b53a8..1da6c11b2a58ac 100644 --- a/stdlib/REPL/src/REPLCompletions.jl +++ b/stdlib/REPL/src/REPLCompletions.jl @@ -519,10 +519,10 @@ CC.bail_out_toplevel_call(::REPLInterpreter, ::CC.InferenceLoopState, ::CC.Infer # `REPLInterpreter` is specifically used by `repl_eval_ex`, where all top-level frames are # `repl_frame` always. However, this assumption wouldn't stand if `REPLInterpreter` were to # be employed, for instance, by `typeinf_ext_toplevel`. -is_repl_frame(sv::CC.InferenceState) = sv.linfo.def isa Module && sv.cache_mode === :no +is_repl_frame(sv::CC.InferenceState) = sv.linfo.def isa Module && sv.cache_mode === CC.CACHE_MODE_NULL function is_call_graph_uncached(sv::CC.InferenceState) - sv.cache_mode === :global && return false + sv.cache_mode === CC.CACHE_MODE_GLOBAL && return false parent = sv.parent parent === nothing && return true return is_call_graph_uncached(parent::CC.InferenceState) diff --git a/test/compiler/inline.jl b/test/compiler/inline.jl index a5ad45b3281dfe..9ef2fdbc34f816 100644 --- a/test/compiler/inline.jl +++ b/test/compiler/inline.jl @@ -570,6 +570,18 @@ end end end +@noinline fresh_edge_noinlined(a::Integer) = unresolvable(a) +let src = code_typed1((Integer,)) do x + @inline fresh_edge_noinlined(x) + end + @test count(iscall((src, fresh_edge_noinlined)), src.code) == 0 +end +let src = code_typed1((Integer,)) do x + @inline fresh_edge_noinlined(x) + end + @test count(iscall((src, fresh_edge_noinlined)), src.code) == 0 # should be idempotent +end + # force constant-prop' for `setproperty!` # https://github.com/JuliaLang/julia/pull/41882 let code = @eval Module() begin