From e7e8b8900d471f38f90d7150c5123c503aa1ac36 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Wed, 27 Dec 2023 15:59:53 -0300 Subject: [PATCH 01/13] update --gcthreads section in command line options (#52645) Make these consistent with what's shown by `julia --help`. Fixes https://github.com/JuliaLang/www.julialang.org/issues/1997. --- doc/man/julia.1 | 6 +++--- doc/src/manual/command-line-interface.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/man/julia.1 b/doc/man/julia.1 index b0af07539cb41..ee0f973c259fd 100644 --- a/doc/man/julia.1 +++ b/doc/man/julia.1 @@ -123,9 +123,9 @@ process affinity is not configured, it uses the number of CPU threads. .TP ---gcthreads -Enable n GC threads; If unspecified is set to half of the -compute worker threads. +--gcthreads=N[,M] +Use N threads for the mark phase of GC and M (0 or 1) threads for the concurrent sweeping phase of GC. +N is set to half of the number of compute threads and M is set to 0 if unspecified. .TP -p, --procs {N|auto} diff --git a/doc/src/manual/command-line-interface.md b/doc/src/manual/command-line-interface.md index c69d221a80f2e..be442e456c2e0 100644 --- a/doc/src/manual/command-line-interface.md +++ b/doc/src/manual/command-line-interface.md @@ -174,7 +174,7 @@ The following is a complete list of command-line switches available when launchi |`-E`, `--print ` |Evaluate `` and display the result| |`-L`, `--load ` |Load `` immediately on all processors| |`-t`, `--threads {N\|auto}` |Enable N threads; `auto` tries to infer a useful default number of threads to use but the exact behavior might change in the future. Currently, `auto` uses the number of CPUs assigned to this julia process based on the OS-specific affinity assignment interface, if supported (Linux and Windows). 
If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads.| -| `--gcthreads {N}` |Enable N GC threads; If unspecified is set to half of the compute worker threads.| +| `--gcthreads=N[,M]` |Use N threads for the mark phase of GC and M (0 or 1) threads for the concurrent sweeping phase of GC. N is set to half of the number of compute threads and M is set to 0 if unspecified.| |`-p`, `--procs {N\|auto}` |Integer value N launches N additional local worker processes; `auto` launches as many workers as the number of local CPU threads (logical cores)| |`--machine-file ` |Run processes on hosts listed in ``| |`-i` |Interactive mode; REPL runs and `isinteractive()` is true| From e6e572eb2c316734a7db08e87a060b137d0fe410 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Thu, 28 Dec 2023 07:50:02 +0600 Subject: [PATCH 02/13] Temporarily remove failing sorting tests (#52643) Tracked by issue #52642 --- test/sorting.jl | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/test/sorting.jl b/test/sorting.jl index 621c82ae7e7fe..35635d8588f93 100644 --- a/test/sorting.jl +++ b/test/sorting.jl @@ -1061,14 +1061,15 @@ end # it would be too much trouble to actually construct a valid pathological input, so we # construct an invalid pathological input. # This test is kind of sketchy because it passes invalid inputs to the function - for i in [1:6, 1:483, 1:957, 77:86, 118:478, 223:227, 231:970, 317:958, 500:501, 500:501, 500:501, 614:620, 632:635, 658:665, 933:940, 937:942, 997:1000, 999:1000] - x = rand(1:5, 1000) - @test partialsort(x, i, lt=(<=)) == sort(x)[i] - end - for i in [1, 7, 8, 490, 495, 852, 993, 996, 1000] - x = rand(1:5, 1000) - @test partialsort(x, i, lt=(<=)) == sort(x)[i] - end + # Temporarily removed due to flakey test failures. See #52642 for details. 
+ # for i in [1:6, 1:483, 1:957, 77:86, 118:478, 223:227, 231:970, 317:958, 500:501, 500:501, 500:501, 614:620, 632:635, 658:665, 933:940, 937:942, 997:1000, 999:1000] + # x = rand(1:5, 1000) + # @test partialsort(x, i, lt=(<=)) == sort(x)[i] + # end + # for i in [1, 7, 8, 490, 495, 852, 993, 996, 1000] + # x = rand(1:5, 1000) + # @test partialsort(x, i, lt=(<=)) == sort(x)[i] + # end end # This testset is at the end of the file because it is slow. From 26d04607c14615631e56973cd30b4fd04d3f4af8 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Thu, 28 Dec 2023 02:52:18 -0500 Subject: [PATCH 03/13] Show more info on why package precompilation was needed (#52619) --- base/loading.jl | 53 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/base/loading.jl b/base/loading.jl index 81778d41893c0..1519375099634 100644 --- a/base/loading.jl +++ b/base/loading.jl @@ -1633,11 +1633,11 @@ end # returns `nothing` if require found a precompile cache for this sourcepath, but couldn't load it # returns the set of modules restored if the cache load succeeded -@constprop :none function _require_search_from_serialized(pkg::PkgId, sourcepath::String, build_id::UInt128) +@constprop :none function _require_search_from_serialized(pkg::PkgId, sourcepath::String, build_id::UInt128; reasons=nothing) assert_havelock(require_lock) paths = find_all_in_cache_path(pkg) for path_to_try in paths::Vector{String} - staledeps = stale_cachefile(pkg, build_id, sourcepath, path_to_try) + staledeps = stale_cachefile(pkg, build_id, sourcepath, path_to_try; reasons) if staledeps === true continue end @@ -2126,11 +2126,11 @@ function _require(pkg::PkgId, env=nothing) set_pkgorigin_version_path(pkg, path) pkg_precompile_attempted = false # being safe to avoid getting stuck in a Pkg.precompile loop - + reasons = Dict{String,Int}() # attempt to load the module file via the precompile cache locations if JLOptions().use_compiled_modules != 0 @label 
load_from_cache - m = _require_search_from_serialized(pkg, path, UInt128(0)) + m = _require_search_from_serialized(pkg, path, UInt128(0); reasons) if m isa Module return m end @@ -2166,7 +2166,7 @@ function _require(pkg::PkgId, env=nothing) # double-check now that we have lock m = _require_search_from_serialized(pkg, path, UInt128(0)) m isa Module && return m - compilecache(pkg, path) + compilecache(pkg, path; reasons) end cachefile_or_module isa Module && return cachefile_or_module::Module cachefile = cachefile_or_module @@ -2568,17 +2568,17 @@ This can be used to reduce package load times. Cache files are stored in `DEPOT_PATH[1]/compiled`. See [Module initialization and precompilation](@ref) for important notes. """ -function compilecache(pkg::PkgId, internal_stderr::IO = stderr, internal_stdout::IO = stdout) +function compilecache(pkg::PkgId, internal_stderr::IO = stderr, internal_stdout::IO = stdout; reasons::Union{Dict{String,Int},Nothing}=nothing) @nospecialize internal_stderr internal_stdout path = locate_package(pkg) path === nothing && throw(ArgumentError("$pkg not found during precompilation")) - return compilecache(pkg, path, internal_stderr, internal_stdout) + return compilecache(pkg, path, internal_stderr, internal_stdout; reasons) end const MAX_NUM_PRECOMPILE_FILES = Ref(10) function compilecache(pkg::PkgId, path::String, internal_stderr::IO = stderr, internal_stdout::IO = stdout, - keep_loaded_modules::Bool = true) + keep_loaded_modules::Bool = true; reasons::Union{Dict{String,Int},Nothing}=nothing) @nospecialize internal_stderr internal_stdout # decide where to put the resulting cache file @@ -2595,7 +2595,7 @@ function compilecache(pkg::PkgId, path::String, internal_stderr::IO = stderr, in end # run the expression and cache the result verbosity = isinteractive() ? 
CoreLogging.Info : CoreLogging.Debug - @logmsg verbosity "Precompiling $pkg" + @logmsg verbosity "Precompiling $pkg $(list_reasons(reasons))" # create a temporary file in `cachepath` directory, write the cache in it, # write the checksum, _and then_ atomically move the file to `cachefile`. @@ -3311,17 +3311,29 @@ function maybe_cachefile_lock(f, pkg::PkgId, srcpath::String; stale_age=compilec f() end end + +function record_reason(reasons::Dict{String,Int}, reason::String) + reasons[reason] = get(reasons, reason, 0) + 1 +end +record_reason(::Nothing, ::String) = nothing +function list_reasons(reasons::Dict{String,Int}) + isempty(reasons) && return "" + return "(cache misses: $(join(("$k ($v)" for (k,v) in reasons), ", ")))" +end + # returns true if it "cachefile.ji" is stale relative to "modpath.jl" and build_id for modkey # otherwise returns the list of dependencies to also check -@constprop :none function stale_cachefile(modpath::String, cachefile::String; ignore_loaded::Bool = false) +@constprop :none function stale_cachefile(modpath::String, cachefile::String; ignore_loaded::Bool = false, reasons=nothing) return stale_cachefile(PkgId(""), UInt128(0), modpath, cachefile; ignore_loaded) end -@constprop :none function stale_cachefile(modkey::PkgId, build_id::UInt128, modpath::String, cachefile::String; ignore_loaded::Bool = false) +@constprop :none function stale_cachefile(modkey::PkgId, build_id::UInt128, modpath::String, cachefile::String; + ignore_loaded::Bool = false, reasons::Union{Dict{String,Int},Nothing}=nothing) io = open(cachefile, "r") try checksum = isvalid_cache_header(io) if iszero(checksum) @debug "Rejecting cache file $cachefile due to it containing an invalid cache header" + record_reason(reasons, "invalid header") return true # invalid cache file end modules, (includes, _, requires), required_modules, srctextpos, prefs, prefs_hash, clone_targets, flags = parse_cache_header(io, cachefile) @@ -3334,6 +3346,7 @@ end current session: $(CacheFlags()) 
cache file: $(CacheFlags(flags)) """ + record_reason(reasons, "mismatched flags") return true end pkgimage = !isempty(clone_targets) @@ -3342,6 +3355,7 @@ end if JLOptions().use_pkgimages == 0 # presence of clone_targets means native code cache @debug "Rejecting cache file $cachefile for $modkey since it would require usage of pkgimage" + record_reason(reasons, "requires pkgimages") return true end rejection_reasons = check_clone_targets(clone_targets) @@ -3350,10 +3364,12 @@ end Reasons=rejection_reasons, var"Image Targets"=parse_image_targets(clone_targets), var"Current Targets"=current_image_targets()) + record_reason(reasons, "target mismatch") return true end if !isfile(ocachefile) @debug "Rejecting cache file $cachefile for $modkey since pkgimage $ocachefile was not found" + record_reason(reasons, "missing ocachefile") return true end else @@ -3362,12 +3378,14 @@ end id = first(modules) if id.first != modkey && modkey != PkgId("") @debug "Rejecting cache file $cachefile for $modkey since it is for $id instead" + record_reason(reasons, "for different pkgid") return true end if build_id != UInt128(0) id_build = (UInt128(checksum) << 64) | id.second if id_build != build_id @debug "Ignoring cache file $cachefile for $modkey ($((UUID(id_build)))) since it does not provide desired build_id ($((UUID(build_id))))" + record_reason(reasons, "for different buildid") return true end end @@ -3389,6 +3407,7 @@ end @goto locate_branch else @debug "Rejecting cache file $cachefile because module $req_key is already loaded and incompatible." + record_reason(reasons, req_key == PkgId(Core) ? "wrong julia version" : "wrong dep version loaded") return true # Won't be able to fulfill dependency end else @@ -3396,6 +3415,7 @@ end path = locate_package(req_key) if path === nothing @debug "Rejecting cache file $cachefile because dependency $req_key not found." 
+ record_reason(reasons, "dep missing source") return true # Won't be able to fulfill dependency end depmods[i] = (path, req_key, req_build_id) @@ -3415,6 +3435,7 @@ end break end @debug "Rejecting cache file $cachefile because it provides the wrong build_id (got $((UUID(build_id)))) for $req_key (want $(UUID(req_build_id)))" + record_reason(reasons, "wrong dep buildid") return true # cachefile doesn't provide the required version of the dependency end end @@ -3423,6 +3444,7 @@ end if !skip_check if !samefile(includes[1].filename, modpath) && !samefile(fixup_stdlib_path(includes[1].filename), modpath) @debug "Rejecting cache file $cachefile because it is for file $(includes[1].filename) not file $modpath" + record_reason(reasons, "wrong source") return true # cache file was compiled from a different path end for (modkey, req_modkey) in requires @@ -3430,6 +3452,7 @@ end pkg = identify_package(modkey, req_modkey.name) if pkg != req_modkey @debug "Rejecting cache file $cachefile because uuid mapping for $modkey => $req_modkey has changed, expected $modkey => $pkg" + record_reason(reasons, "dep uuid changed") return true end end @@ -3437,6 +3460,7 @@ end f, fsize_req, hash_req, ftime_req = chi.filename, chi.fsize, chi.hash, chi.mtime if startswith(f, "@depot/") @debug("Rejecting stale cache file $cachefile because its depot could not be resolved") + record_reason(reasons, "nonresolveable depot") return true end if !ispath(f) @@ -3445,6 +3469,7 @@ end continue end @debug "Rejecting stale cache file $cachefile because file $f does not exist" + record_reason(reasons, "missing sourcefile") return true end if ftime_req >= 0.0 @@ -3458,17 +3483,20 @@ end !( 0 < (ftime_req - ftime) < 1e-6 ) # PR #45552: Compensate for Windows tar giving mtimes that may be incorrect by up to one microsecond if is_stale @debug "Rejecting stale cache file $cachefile because mtime of include_dependency $f has changed (mtime $ftime, before $ftime_req)" + record_reason(reasons, "include_dependency 
mtime change") return true end else fsize = filesize(f) if fsize != fsize_req @debug "Rejecting stale cache file $cachefile because file size of $f has changed (file size $fsize, before $fsize_req)" + record_reason(reasons, "include_dependency fsize change") return true end hash = open(_crc32c, f, "r") if hash != hash_req @debug "Rejecting stale cache file $cachefile because hash of $f has changed (hash $hash, before $hash_req)" + record_reason(reasons, "include_dependency fhash change") return true end end @@ -3477,12 +3505,14 @@ end if !isvalid_file_crc(io) @debug "Rejecting cache file $cachefile because it has an invalid checksum" + record_reason(reasons, "invalid checksum") return true end if pkgimage if !isvalid_pkgimage_crc(io, ocachefile::String) @debug "Rejecting cache file $cachefile because $ocachefile has an invalid checksum" + record_reason(reasons, "ocachefile invalid checksum") return true end end @@ -3490,6 +3520,7 @@ end curr_prefs_hash = get_preferences_hash(id.uuid, prefs) if prefs_hash != curr_prefs_hash @debug "Rejecting cache file $cachefile because preferences hash does not match 0x$(string(prefs_hash, base=16)) != 0x$(string(curr_prefs_hash, base=16))" + record_reason(reasons, "preferences hash mismatch") return true end From e96c13aa5b2e2b41e618ac009d43e1fdea0a70d4 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Thu, 28 Dec 2023 07:42:28 -0500 Subject: [PATCH 04/13] update nthreads info in versioninfo (#52423) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes https://github.com/JuliaLang/julia/issues/52404 @nilshg I opted to make it one line as it fits. 
``` julia> versioninfo() Julia Version 1.11.0-DEV.1011 Commit bb7091c6f2* (2023-12-04 14:58 UTC) Platform Info: OS: macOS (arm64-apple-darwin23.0.0) CPU: 10 × Apple M2 Pro WORD_SIZE: 64 LLVM: libLLVM-15.0.7 (ORCJIT, apple-m1) Threads: 1 default, 0 interactive, 1 GC (on 6 virtual cores) Environment: JULIA_EDITOR = code ``` --- stdlib/InteractiveUtils/src/InteractiveUtils.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stdlib/InteractiveUtils/src/InteractiveUtils.jl b/stdlib/InteractiveUtils/src/InteractiveUtils.jl index 629cf88cd8ce6..9927af1a4db9a 100644 --- a/stdlib/InteractiveUtils/src/InteractiveUtils.jl +++ b/stdlib/InteractiveUtils/src/InteractiveUtils.jl @@ -160,7 +160,8 @@ function versioninfo(io::IO=stdout; verbose::Bool=false) end println(io, " WORD_SIZE: ", Sys.WORD_SIZE) println(io, " LLVM: libLLVM-",Base.libllvm_version," (", Sys.JIT, ", ", Sys.CPU_NAME, ")") - println(io, " Threads: ", Threads.maxthreadid(), " on ", Sys.CPU_THREADS, " virtual cores") + println(io, """Threads: $(Threads.nthreads(:default)) default, $(Threads.nthreads(:interactive)) interactive, \ + $(Threads.ngcthreads()) GC (on $(Sys.CPU_THREADS) virtual cores)""") function is_nonverbose_env(k::String) return occursin(r"^JULIA_|^DYLD_|^LD_", k) From 90ae544fb57acdebd5b6ecd592ebd68c3ee629ed Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 28 Dec 2023 14:18:07 -0500 Subject: [PATCH 05/13] fix typo in NEWS (#52652) Typo from #52461. --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 958c51e3f840f..d5ef405b7a913 100644 --- a/NEWS.md +++ b/NEWS.md @@ -84,9 +84,9 @@ New library features write the output to a stream rather than returning a string ([#48625]). * `sizehint!(s, n)` now supports an optional `shrink` argument to disable shrinking ([#51929]). * New function `Docs.hasdoc(module, symbol)` tells whether a name has a docstring ([#52139]). 
-* Passing an IOBuffer as a stdout argument for Process spawn now works as +* Passing an `IOBuffer` as a stdout argument for `Process` spawn now works as expected, synchronized with `wait` or `success`, so a `Base.BufferStream` is - no longer required there for correctness to avoid data-races ([#TBD]). + no longer required there for correctness to avoid data races ([#52461]). * After a process exits, `closewrite` will no longer be automatically called on the stream passed to it. Call `wait` on the process instead to ensure the content is fully written, then call `closewrite` manually to avoid From ad3769e86d8657ea5cbb566c8826f7c0dbf10696 Mon Sep 17 00:00:00 2001 From: Neven Sajko Date: Thu, 28 Dec 2023 22:50:59 +0100 Subject: [PATCH 06/13] `Base`: make `Tuple(::Pair)` type-stable (#52650) Fixes #52636 --- base/tuple.jl | 1 + test/tuple.jl | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/base/tuple.jl b/base/tuple.jl index 2e1c5972c407d..19b88234cbb00 100644 --- a/base/tuple.jl +++ b/base/tuple.jl @@ -417,6 +417,7 @@ _totuple(::Type{Tuple}, itr, s...) 
= (collect(Iterators.rest(itr,s...))...,) _totuple(::Type{Tuple}, itr::Array) = (itr...,) _totuple(::Type{Tuple}, itr::SimpleVector) = (itr...,) _totuple(::Type{Tuple}, itr::NamedTuple) = (itr...,) +_totuple(::Type{Tuple}, p::Pair) = (p.first, p.second) _totuple(::Type{Tuple}, x::Number) = (x,) # to make Tuple(x) inferable end diff --git a/test/tuple.jl b/test/tuple.jl index b806667fd9d0a..3a8fb51051737 100644 --- a/test/tuple.jl +++ b/test/tuple.jl @@ -807,3 +807,9 @@ namedtup = (;a=1, b=2, c=3) @test Val{Tuple{Vararg{T,4}} where T} === Val{Tuple{T,T,T,T} where T} @test Val{Tuple{Int64, Vararg{Int32,N}} where N} === Val{Tuple{Int64, Vararg{Int32}}} @test Val{Tuple{Int32, Vararg{Int64}}} === Val{Tuple{Int32, Vararg{Int64,N}} where N} + +@testset "from Pair, issue #52636" begin + pair = (1 => "2") + @test (1, "2") == @inferred Tuple(pair) + @test (1, "2") == @inferred Tuple{Int,String}(pair) +end From e8f89682d7b434f1159626a213756b3691f48d03 Mon Sep 17 00:00:00 2001 From: Bhuminjay Soni <76656712+11happy@users.noreply.github.com> Date: Fri, 29 Dec 2023 11:03:19 +0530 Subject: [PATCH 07/13] Added Tests for Permute function in combinatorics.jl (#52648) Signed-off-by: happy Co-authored-by: happy --- test/combinatorics.jl | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/combinatorics.jl b/test/combinatorics.jl index 25a444b70ec36..862e3bfa37e1e 100644 --- a/test/combinatorics.jl +++ b/test/combinatorics.jl @@ -129,3 +129,24 @@ end end end end + +@testset "permute!" 
begin + #simple array + @test permute!([1,2,3,4,5],[3,2,1,5,4]) == [3,2,1,5,4] + #empty array + @test permute!([],[]) == [] + #single-element array + @test permute!([5],[1]) == [5] + #repeated elements in array + @test permute!([1,2,2,3,3,3],[2,1,3,5,4,6]) == [2,1,2,3,3,3] + #permutation vector contains zero + @test_throws BoundsError permute!([1,2,3],[0,1,2]) + #permutation vector contains negative indices + @test_throws BoundsError permute!([1,2,3],[2,-1,1]) + #permutation vector contains indices larger than array size + @test_throws BoundsError permute!([1,2,3],[2,4,1]) + #permutation vector is empty + @test_throws DimensionMismatch permute!([1,2,3],[]) + #array is empty + @test_throws BoundsError permute!([],[2,1]) +end From 2b2f5344138f2b50ee91132592733743f33c5358 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Fri, 29 Dec 2023 20:37:46 -0300 Subject: [PATCH 08/13] minor fix in malloc terminology used in docs (#52665) --- doc/src/manual/performance-tips.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/src/manual/performance-tips.md b/doc/src/manual/performance-tips.md index 68ee5132f8592..579413e808f7b 100644 --- a/doc/src/manual/performance-tips.md +++ b/doc/src/manual/performance-tips.md @@ -94,8 +94,8 @@ a vector of 64-bit floats so there should be no need to allocate (heap) memory. We should clarify that what `@time` reports is specifically *heap* allocations, which are typically needed for either mutable objects or for creating/growing variable-sized containers (such as `Array` or `Dict`, strings, or "type-unstable" -objects whose type is only known at runtime). Allocating (or deallocating) such blocks of memory may require an expensive -system call (e.g. via `malloc` in C), and they must be tracked for garbage collection. In contrast, immutable values like +objects whose type is only known at runtime). 
Allocating (or deallocating) such blocks of memory may require an expensive function +call to libc (e.g. via `malloc` in C), and they must be tracked for garbage collection. In contrast, immutable values like numbers (except bignums), tuples, and immutable `struct`s can be stored much more cheaply, e.g. in stack or CPU-register memory, so one doesn’t typically worry about the performance cost of "allocating" them. From 3f4cfc6d19bfe684223c3f27771d1839eafe6aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Sat, 30 Dec 2023 10:04:07 +0100 Subject: [PATCH 09/13] [dSFMT_jll] Upgrade to v2.2.5 (#52667) Usual memo to self: * update version number in `stdlib/dSFMT_jll/Project.toml` * refresh checksums with `make -f contrib/refresh_checksums.mk -j dsfmt` * update version number in `deps/checksums/dsfmt` --- deps/checksums/dsfmt | 68 +++++++++++++++++------------------ deps/dsfmt.version | 4 ++- stdlib/dSFMT_jll/Project.toml | 2 +- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/deps/checksums/dsfmt b/deps/checksums/dsfmt index 63c7e26f0eb43..0666e51efa994 100644 --- a/deps/checksums/dsfmt +++ b/deps/checksums/dsfmt @@ -1,34 +1,34 @@ -dSFMT.v2.2.4+4.aarch64-apple-darwin.tar.gz/md5/43b52709b7794c92931286174854c886 -dSFMT.v2.2.4+4.aarch64-apple-darwin.tar.gz/sha512/018b67a06cdf42dda2a906025e8a12e026af9b39fe8281890dc90d66a422c3af2a8430d42677f79d123fd0ab0e8d5c37db2e0a00ef03731d35cbb65f9e59b108 -dSFMT.v2.2.4+4.aarch64-linux-gnu.tar.gz/md5/260e14855dbc7773a2ca906d58cc57f2 -dSFMT.v2.2.4+4.aarch64-linux-gnu.tar.gz/sha512/820ca4c6afde931e855b74015150f4ffbb513276c3fa7dbcc1ec8d34c02d4989fb7424a6e4f81f93d054811b5f54f8633d955b05acdb088387ee90f1c3b00915 -dSFMT.v2.2.4+4.aarch64-linux-musl.tar.gz/md5/7ddccbad6b5c9de4be187fe76637a0d8 -dSFMT.v2.2.4+4.aarch64-linux-musl.tar.gz/sha512/e3c225da00927096e3a6cd4abc681fba8f469cb74828e7054d4f5684d71dcb8e75c9a81f14fa10bfbb78f62f9567a31a92edcca8d797e5810a2a44a3fc17bc84 
-dSFMT.v2.2.4+4.armv6l-linux-gnueabihf.tar.gz/md5/a70329e0a6c57009c6b6950fd34089f6 -dSFMT.v2.2.4+4.armv6l-linux-gnueabihf.tar.gz/sha512/4418c42165660adc050e872ef834f920c89ed6a0d2b816821672b1e862e947aad7efd023289da9bf05bb2eb9ec4b9d2561c403e2d5384d5314a4ba016b1f9cfc -dSFMT.v2.2.4+4.armv6l-linux-musleabihf.tar.gz/md5/6ffc798b8a0c847fa5cb93640bd66ab3 -dSFMT.v2.2.4+4.armv6l-linux-musleabihf.tar.gz/sha512/94e5ae07d0b1420abd7290519bce6f77deae634bbb4df31e3f02416bf509e555a9b1c9d19dd77ca76a308c2b86d5c9d4718b9ef83c13167b88a8181d8ca7e73a -dSFMT.v2.2.4+4.armv7l-linux-gnueabihf.tar.gz/md5/660d95aa08580ca1716a89c4d8b1eb24 -dSFMT.v2.2.4+4.armv7l-linux-gnueabihf.tar.gz/sha512/bc757a9f805047be5375f92c10a3f3eab69345a4ec5cc997f763e66be36144a74d414ff926df8e17b9d5a2394189269c3188c55e0b7c75a72495394d65510cef -dSFMT.v2.2.4+4.armv7l-linux-musleabihf.tar.gz/md5/78c487049092fe61949d506637c713bb -dSFMT.v2.2.4+4.armv7l-linux-musleabihf.tar.gz/sha512/03ddada4478f05eab7d2971b2deaf2cba91f084d7ce66fc8219bcb3cf5c308ea13959fed95568ca80f4ce11794e197092984919265716de8f2558e2cb30d94ce -dSFMT.v2.2.4+4.i686-linux-gnu.tar.gz/md5/b0f535336cca76f1dcdacca29c6f8410 -dSFMT.v2.2.4+4.i686-linux-gnu.tar.gz/sha512/cc03a246b32875037a41a45c1004834abc7c67f90bf17e1b41cc604ee9893147b1ca3978a2e103b94c94ac617380570473de1f66bff15de8e4ee05c5a3c21059 -dSFMT.v2.2.4+4.i686-linux-musl.tar.gz/md5/a61405f72c9a3bba5718f078c68e61a5 -dSFMT.v2.2.4+4.i686-linux-musl.tar.gz/sha512/726f130bbbfd0dece4185b89a25a73f3b5b950ebfb7f86aea6e9cbcf9ae932e591d20b854de0b4985103dbf8b4b7cb3560661c5070af971cd2c1f3ec3e1ea7d2 -dSFMT.v2.2.4+4.i686-w64-mingw32.tar.gz/md5/93670f43a98f7c6045427dc9ddd89a4a -dSFMT.v2.2.4+4.i686-w64-mingw32.tar.gz/sha512/b76c2be073312ffec8c778b83d3e37b5d0c5dba770ffcc95a6ebfba516a948beb08419428192fcd5dda83357a64e90c4e3a40144688f128133400284a7363b8e -dSFMT.v2.2.4+4.powerpc64le-linux-gnu.tar.gz/md5/fd8c73961ef7c82201e6d86e8bf4324c 
-dSFMT.v2.2.4+4.powerpc64le-linux-gnu.tar.gz/sha512/1bd0ebd019cfc6f25f7ba007547c5ee297854655b93c55e90d8ead420875de5a087e38956693d5e901ff2abf667c72aa66fb34f587b82adf4b91b3d5d666b5c7 -dSFMT.v2.2.4+4.x86_64-apple-darwin.tar.gz/md5/b57ec1491ffdd40c72860b9f1869160c -dSFMT.v2.2.4+4.x86_64-apple-darwin.tar.gz/sha512/c3a192dbcd3e768712d12d3ac851f46bfa1517eca16c9a187025553076c8fb886b925e4e3f5f20531180f72b73e7eaa5281f54d5b7d4e6a4d53f4542c4bb33b6 -dSFMT.v2.2.4+4.x86_64-linux-gnu.tar.gz/md5/fa671f4ca14b171d53c8866d03f9162a -dSFMT.v2.2.4+4.x86_64-linux-gnu.tar.gz/sha512/2e242a1448da0508ea88cc1a106f1e74f8d7e7562cd82b80d86abf9a8b454653ad7612e25c30ce00c23757e8a5b7b5736253b00a52f9473af6c5d4df768138f2 -dSFMT.v2.2.4+4.x86_64-linux-musl.tar.gz/md5/c648294163882ec539ab646542c74880 -dSFMT.v2.2.4+4.x86_64-linux-musl.tar.gz/sha512/9e96a47d660854b6517364f0db40a2f4e0e3b814499a0349f7cf550b1c8d04589fca5eb4a75bf34f36d1b5d1b2277b3e9a961c887092abedd08f438e025329e7 -dSFMT.v2.2.4+4.x86_64-unknown-freebsd.tar.gz/md5/b4497d34d72ce134ce110b6185a82393 -dSFMT.v2.2.4+4.x86_64-unknown-freebsd.tar.gz/sha512/23d0fb273edbb5a08920a3683398e11d6f4df137dabcfc5f395a9175ddf14ab8999eb961ae8f4b76715a5a2dd2b77757f752abce35c1f752b800201e93aae874 -dSFMT.v2.2.4+4.x86_64-w64-mingw32.tar.gz/md5/d380963292bc54d27d39a3f94adbd5ac -dSFMT.v2.2.4+4.x86_64-w64-mingw32.tar.gz/sha512/ef2f99b17b1a36e61fb4d149d8a8fccc9e804b3b727f2426fca917c265c2d7ada4e3abaa5383e25136dec8de262c1d11970a01d7cfb513a55f1d86a23534e864 -dsfmt-2.2.4.tar.gz/md5/ed30e63552d62df48d709dde4f755660 -dsfmt-2.2.4.tar.gz/sha512/fe84e986cbf198172340adfac0436b08f087643eca3f1ceccacde146cbfd8c41e3eb0dfbb062f7ca5f462db13c386abd7c269bc0cbefc9a0ecf97a8a8870a2e4 +dSFMT.v2.2.5+0.aarch64-apple-darwin.tar.gz/md5/36284767f523bb297633d7da17a7db5a +dSFMT.v2.2.5+0.aarch64-apple-darwin.tar.gz/sha512/e6434c154db4c7187f227a550b159a8db8cfffc514323ca31112744a80a007ba5c95f2274cee30c0aa8caf1b20fb643cb814651a622b8e4bb2e5652878e504d2 
+dSFMT.v2.2.5+0.aarch64-linux-gnu.tar.gz/md5/260e14855dbc7773a2ca906d58cc57f2 +dSFMT.v2.2.5+0.aarch64-linux-gnu.tar.gz/sha512/820ca4c6afde931e855b74015150f4ffbb513276c3fa7dbcc1ec8d34c02d4989fb7424a6e4f81f93d054811b5f54f8633d955b05acdb088387ee90f1c3b00915 +dSFMT.v2.2.5+0.aarch64-linux-musl.tar.gz/md5/7ddccbad6b5c9de4be187fe76637a0d8 +dSFMT.v2.2.5+0.aarch64-linux-musl.tar.gz/sha512/e3c225da00927096e3a6cd4abc681fba8f469cb74828e7054d4f5684d71dcb8e75c9a81f14fa10bfbb78f62f9567a31a92edcca8d797e5810a2a44a3fc17bc84 +dSFMT.v2.2.5+0.armv6l-linux-gnueabihf.tar.gz/md5/a70329e0a6c57009c6b6950fd34089f6 +dSFMT.v2.2.5+0.armv6l-linux-gnueabihf.tar.gz/sha512/4418c42165660adc050e872ef834f920c89ed6a0d2b816821672b1e862e947aad7efd023289da9bf05bb2eb9ec4b9d2561c403e2d5384d5314a4ba016b1f9cfc +dSFMT.v2.2.5+0.armv6l-linux-musleabihf.tar.gz/md5/6ffc798b8a0c847fa5cb93640bd66ab3 +dSFMT.v2.2.5+0.armv6l-linux-musleabihf.tar.gz/sha512/94e5ae07d0b1420abd7290519bce6f77deae634bbb4df31e3f02416bf509e555a9b1c9d19dd77ca76a308c2b86d5c9d4718b9ef83c13167b88a8181d8ca7e73a +dSFMT.v2.2.5+0.armv7l-linux-gnueabihf.tar.gz/md5/660d95aa08580ca1716a89c4d8b1eb24 +dSFMT.v2.2.5+0.armv7l-linux-gnueabihf.tar.gz/sha512/bc757a9f805047be5375f92c10a3f3eab69345a4ec5cc997f763e66be36144a74d414ff926df8e17b9d5a2394189269c3188c55e0b7c75a72495394d65510cef +dSFMT.v2.2.5+0.armv7l-linux-musleabihf.tar.gz/md5/78c487049092fe61949d506637c713bb +dSFMT.v2.2.5+0.armv7l-linux-musleabihf.tar.gz/sha512/03ddada4478f05eab7d2971b2deaf2cba91f084d7ce66fc8219bcb3cf5c308ea13959fed95568ca80f4ce11794e197092984919265716de8f2558e2cb30d94ce +dSFMT.v2.2.5+0.i686-linux-gnu.tar.gz/md5/11463fd3981a8c143d7aed691d18d4e0 +dSFMT.v2.2.5+0.i686-linux-gnu.tar.gz/sha512/db946a4fbd8a3163b8b1c25e02bfc4a841da7d2532892a99037bd48ac98e1840691e8cc0127d9457a82667a0131e4826cb4e9d0a13f127afc62da4eb68af5a3e +dSFMT.v2.2.5+0.i686-linux-musl.tar.gz/md5/a61405f72c9a3bba5718f078c68e61a5 
+dSFMT.v2.2.5+0.i686-linux-musl.tar.gz/sha512/726f130bbbfd0dece4185b89a25a73f3b5b950ebfb7f86aea6e9cbcf9ae932e591d20b854de0b4985103dbf8b4b7cb3560661c5070af971cd2c1f3ec3e1ea7d2 +dSFMT.v2.2.5+0.i686-w64-mingw32.tar.gz/md5/3bc27ef8f26c7a26f096cf1d558d408d +dSFMT.v2.2.5+0.i686-w64-mingw32.tar.gz/sha512/ea3608d3ae3874ea57a1a08f69abe2a1638bc340db71c6fe3c4fd5637d8c54943bf16b099a46817387c1ed4cb5f3cd1c0ff19ae8a4ed85dd555555821af06374 +dSFMT.v2.2.5+0.powerpc64le-linux-gnu.tar.gz/md5/fd8c73961ef7c82201e6d86e8bf4324c +dSFMT.v2.2.5+0.powerpc64le-linux-gnu.tar.gz/sha512/1bd0ebd019cfc6f25f7ba007547c5ee297854655b93c55e90d8ead420875de5a087e38956693d5e901ff2abf667c72aa66fb34f587b82adf4b91b3d5d666b5c7 +dSFMT.v2.2.5+0.x86_64-apple-darwin.tar.gz/md5/6be9f2d3cd8d45a3fc1c3feeebcbbf00 +dSFMT.v2.2.5+0.x86_64-apple-darwin.tar.gz/sha512/5d17c2c0eedad6739b41b8a613e9e452df484136ecd11aed1f6b9f426ae1deaef9faf721810080ebc1701a88a3e7ae91b1992893598c33b342c3f876661f2f8e +dSFMT.v2.2.5+0.x86_64-linux-gnu.tar.gz/md5/fa671f4ca14b171d53c8866d03f9162a +dSFMT.v2.2.5+0.x86_64-linux-gnu.tar.gz/sha512/2e242a1448da0508ea88cc1a106f1e74f8d7e7562cd82b80d86abf9a8b454653ad7612e25c30ce00c23757e8a5b7b5736253b00a52f9473af6c5d4df768138f2 +dSFMT.v2.2.5+0.x86_64-linux-musl.tar.gz/md5/c648294163882ec539ab646542c74880 +dSFMT.v2.2.5+0.x86_64-linux-musl.tar.gz/sha512/9e96a47d660854b6517364f0db40a2f4e0e3b814499a0349f7cf550b1c8d04589fca5eb4a75bf34f36d1b5d1b2277b3e9a961c887092abedd08f438e025329e7 +dSFMT.v2.2.5+0.x86_64-unknown-freebsd.tar.gz/md5/5b53e6c5b78102f563742b4b3d888ec6 +dSFMT.v2.2.5+0.x86_64-unknown-freebsd.tar.gz/sha512/5db5902c7ec2624add768b9e2866f9aac224a31bcb4114d450c45717e2b244521b7c511c059527d557a71639cff98e190a38cd3e28db5be0b1faf0a1762cb1a5 +dSFMT.v2.2.5+0.x86_64-w64-mingw32.tar.gz/md5/386adb3b7593c222dc7a1060a1356b21 +dSFMT.v2.2.5+0.x86_64-w64-mingw32.tar.gz/sha512/fe2ab5021126807b37042e89a22ef9a869c6a0a028680df445773b2affd11c2b02148be07d53504ea3842bb38bb62fe039529688266c1cba3545a892bd4dc185 
+dsfmt-2.2.5.tar.gz/md5/d22e476b52cdee7d5b90d2f289570073 +dsfmt-2.2.5.tar.gz/sha512/951e8669350f750b8915a819e704eae0a9b9c9518b3e3b9a1905f9ca0d25cc4c2486cb479e258a4a114e9c26ceb73a6c4e9f1cc02ed19173aeb8f20189754f6b diff --git a/deps/dsfmt.version b/deps/dsfmt.version index bbb63417f46cd..d81db2d10ff09 100644 --- a/deps/dsfmt.version +++ b/deps/dsfmt.version @@ -1,5 +1,7 @@ +# -*- makefile -*- + ## jll artifact DSFMT_JLL_NAME := dSFMT ## source build -DSFMT_VER := 2.2.4 +DSFMT_VER := 2.2.5 diff --git a/stdlib/dSFMT_jll/Project.toml b/stdlib/dSFMT_jll/Project.toml index a83775f625987..0db19e602f67b 100644 --- a/stdlib/dSFMT_jll/Project.toml +++ b/stdlib/dSFMT_jll/Project.toml @@ -1,6 +1,6 @@ name = "dSFMT_jll" uuid = "05ff407c-b0c1-5878-9df8-858cc2e60c36" -version = "2.2.4+4" +version = "2.2.5+0" [deps] Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" From 9deee4619209c997077e843b7549abc292884819 Mon Sep 17 00:00:00 2001 From: djturizo <72823727+djturizo@users.noreply.github.com> Date: Sat, 30 Dec 2023 05:58:31 -0500 Subject: [PATCH 10/13] Bunch-Kaufman factorization support for generic number types and inertia computations (#51487) ### Introduction This PR adds a generic implementation of the Bunch-Kaufman factorization in native Julia code, and a generic implementation of a inertia calculation function. Right now Julia only support the Bunch-Kaufman factorization for `Float32`, `Float64` and their complex variants. This is because the factorization is handled by LAPACK, which only supports these types. To extend support to generic number types, I translated the LAPACK implementation to native Julia code, and the code performs the factorization in-place. I also included the function `inertia` that computes the number of positive, negative, and zero eigenvalues of an $n \times n$ Bunch-Kaufman factorized matrix in $\mathcal{O}(n)$ time. ### Changes - `bunchkaufman` and `bunchkaufman!` now work for any `AbstractFloat`, `Rational` and their complex variants. 
Behavior for previously supported types is not changed (LAPACK is used when possible). `bunchkaufman!` does not support `Integer` types, as in general the factorization lies in the arithmetic closure of the number type (the rationals for the integers). On the other hand, `bunchkaufman` supports `Integer` types, by making an internal conversion to `Rational{BigInt}`. - `ldiv!` for a `BunchKaufman` factorization has extended support for generic number types with type stability. - Previously, Julia extracted the diagonal factor of an $n \times n$ `BunchKaufman` object by making a copy of the matrix and then calling a LAPACK function (`dsyconvf`, `csyconvf`, etc., depending on the number type). This function also computes the triangular factor, so it runs in $\mathcal{O}(n^2)$ time. Now Julia uses a native implementation of the LAPACK function with small modifications, so it computes the diagonal factor in $\mathcal{O}(n)$ time, without making a new copy of the matrix. - Added the function `inertia` that computes the number of positive, negative and zero eigenvalues of the diagonal factor of an $n \times n$ `BunchKaufman` object, in case that the matrix is real symmetric or Hermitian. For complex symmetric matrices, `inertia` only computes the number of zero eigenvalues of the diagonal factor. `inertia` runs in $\mathcal{O}(n)$ time and only uses arithmetic and real absolute value operations. Therefore, `inertia` can be used for matrices of any generic number type, including `Rational`. In particular, for rational matrices the output of `inertia` is exact (unless a positive tolerance is specified). - Unit tests of the `BunchKaufman` library have been adapted to handle low precision number types (approximate comparisons with tolerance `sqrt(eps(Float64))` do not make sense when the floating point type is `Float16`, for example). 
The test-set now also runs on the following types: `Float16, Complex{Float16}, BigFloat, Complex{BigFloat}, Complex{Int}, BigInt, Complex{BigInt}, Rational{BigInt}, Complex{Rational{BigInt}}`. Unit tests for the `inertia` function have been added too. --- NEWS.md | 1 + stdlib/LinearAlgebra/src/bunchkaufman.jl | 1208 ++++++++++++++++++++- stdlib/LinearAlgebra/test/bunchkaufman.jl | 81 +- 3 files changed, 1263 insertions(+), 27 deletions(-) diff --git a/NEWS.md b/NEWS.md index d5ef405b7a913..82ae28a3b420d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -115,6 +115,7 @@ Standard library changes respectively, now efficiently compute the generalized eigenvalues (`eigen`: and eigenvectors) of `A` and `B`. Note: The second argument is the output of `bunchkaufman` or `lu` ([#50471]). * Structured matrices now retain either the axes of the parent (for `Symmetric`/`Hermitian`/`AbstractTriangular`/`UpperHessenberg`), or that of the principal diagonal (for banded matrices) ([#52480]). +* `bunchkaufman` and `bunchkaufman!` now work for any `AbstractFloat`, `Rational` and their complex variants. `bunchkaufman` now supports `Integer` types, by making an internal conversion to `Rational{BigInt}`. Added new function `inertia` that computes the inertia of the diagonal factor given by the `BunchKaufman` factorization object of a real symmetric or Hermitian matrix. For complex symmetric matrices, `inertia` only computes the number of zero eigenvalues of the diagonal factor ([#51487]). #### Printf diff --git a/stdlib/LinearAlgebra/src/bunchkaufman.jl b/stdlib/LinearAlgebra/src/bunchkaufman.jl index f995b3b8444c4..19f82167495f3 100644 --- a/stdlib/LinearAlgebra/src/bunchkaufman.jl +++ b/stdlib/LinearAlgebra/src/bunchkaufman.jl @@ -4,6 +4,15 @@ ## LD for BunchKaufman, UL for CholeskyDense, LU for LUDense and ## define size methods for Factorization types using it. +##----------- Type utilities for generic Bunch-Kaufman implementation ------------ +# Generic real type. 
Any real number type should able to approximate +# real numbers, and thus be closed under arithmetic operations. +# Therefore so Int, Complex{Int}, etc. are excluded. +ClosedReal = T where T <: Union{AbstractFloat, Rational} +# Similarly, we also use a closed scalar type +ClosedScalar = Union{T, Complex{T}} where T <: ClosedReal +##-------------------------------------------------------------------------------- + """ BunchKaufman <: Factorization @@ -22,10 +31,10 @@ as appropriate given `S.uplo`, and `S.p`. # Examples ```jldoctest -julia> A = [1 2; 2 3] -2×2 Matrix{Int64}: - 1 2 - 2 3 +julia> A = Float64.([1 2; 2 3]) +2×2 Matrix{Float64}: + 1.0 2.0 + 2.0 3.0 julia> S = bunchkaufman(A) # A gets wrapped internally by Symmetric(A) BunchKaufman{Float64, Matrix{Float64}, Vector{Int64}} @@ -145,10 +154,10 @@ The following functions are available for `BunchKaufman` objects: # Examples ```jldoctest -julia> A = [1 2; 2 3] -2×2 Matrix{Int64}: - 1 2 - 2 3 +julia> A = Float64.([1 2; 2 3]) +2×2 Matrix{Float64}: + 1.0 2.0 + 2.0 3.0 julia> S = bunchkaufman(A) # A gets wrapped internally by Symmetric(A) BunchKaufman{Float64, Matrix{Float64}, Vector{Int64}} @@ -237,37 +246,47 @@ function _ipiv2perm_bk(v::AbstractVector{T}, maxi::Integer, uplo::AbstractChar, return p end -function getproperty(B::BunchKaufman{T,<:StridedMatrix}, d::Symbol) where {T<:BlasFloat} +function getproperty(B::BunchKaufman{TS}, + d::Symbol) where TS <: ClosedScalar{TR} where TR <: ClosedReal n = size(B, 1) if d === :p return _ipiv2perm_bk(getfield(B, :ipiv), n, getfield(B, :uplo), B.rook) elseif d === :P - return Matrix{T}(I, n, n)[:,invperm(B.p)] + return Matrix{TS}(I, n, n)[:,invperm(B.p)] elseif d === :L || d === :U || d === :D - if getfield(B, :rook) - LUD, od = LAPACK.syconvf_rook!(getfield(B, :uplo), 'C', copy(getfield(B, :LD)), getfield(B, :ipiv)) + if d === :D + _, od, md = generic_syconv(B, false) + elseif typeof(B) <: BunchKaufman{T,<:StridedMatrix} where {T<:BlasFloat} + # We use LAPACK whenever we 
can + if getfield(B, :rook) + LUD, _ = LAPACK.syconvf_rook!(getfield(B, :uplo), 'C', + copy(getfield(B, :LD)), getfield(B, :ipiv)) + else + LUD, _ = LAPACK.syconv!(getfield(B, :uplo), copy(getfield(B, :LD)), + getfield(B, :ipiv)) + end else - LUD, od = LAPACK.syconv!(getfield(B, :uplo), copy(getfield(B, :LD)), getfield(B, :ipiv)) + LUD, _ = generic_syconv(B) end if d === :D if getfield(B, :uplo) == 'L' odl = od[1:n - 1] - return Tridiagonal(odl, diag(LUD), getfield(B, :symmetric) ? odl : conj.(odl)) + return Tridiagonal(odl, md, getfield(B, :symmetric) ? odl : conj.(odl)) else # 'U' odu = od[2:n] - return Tridiagonal(getfield(B, :symmetric) ? odu : conj.(odu), diag(LUD), odu) + return Tridiagonal(getfield(B, :symmetric) ? odu : conj.(odu), md, odu) end elseif d === :L if getfield(B, :uplo) == 'L' return UnitLowerTriangular(LUD) else - throw(ArgumentError("factorization is U*D*transpose(U) but you requested L")) + throw(ArgumentError("factorization is U*D*U' but you requested L")) end else # :U if B.uplo == 'U' return UnitUpperTriangular(LUD) else - throw(ArgumentError("factorization is L*D*transpose(L) but you requested U")) + throw(ArgumentError("factorization is L*D*L' but you requested U")) end end else @@ -411,3 +430,1158 @@ end ## reconstruct the original matrix ## TODO: understand the procedure described at ## http://www.nag.com/numeric/FL/nagdoc_fl22/pdf/F07/f07mdf.pdf + + +##-------------------------------------------------------------------------- +##------------- Start of generic Bunch-Kaufman Implementation -------------- +##-------------------------------------------------------------------------- + +export inertia + +function arg_illegal(fun_name::AbstractString, + info::Integer, + waer::AbstractChar) + if waer == 'W' + @warn " ** On entry to '$(fun_name)' parameter number " * + "$(info) had an illegal value" + else + error(" ** On entry to '$(fun_name)' parameter number " * + "$(info) had an illegal value") + end +end + + +function cabs1(z::T) where T 
<: Complex + return abs(real(z)) + abs(imag(z)) +end + + +function cabsr(z::T) where T <: Complex + return abs(real(z)) +end + + +""" +generic_adr1!(uplo, alpha, x, y, A, syhe) -> nothing + +`generic_adr1!` performs the following adjoint (symmetric or Hermitian) +rank 1 operation + +`A[1:K,1:L] = alpha*x*y' + A[1:K,1:L]` + +in-place, where `alpha` is a scalar, `x` is a K element vector, `y` +is an L element vector and `A` is an `NxM` matrix. Note that `y'` can +denote either the transpose, i.e. `transpose(y)` or the conjugate +transpose , i.e. `adjoint(y)`. + +`uplo` is a character, either `'U'`, `'L'` or `'F'`, indicating whether +the matrix is stored in the upper triangular part (`uplo=='U'`), the +lower triangular part (`uplo=='L'`), or the full storage space is used +(`uplo=='F'`). If `uplo!='F'` then only the corresponding triangular +part is updated. The values `'U'` or `'L'` can only be used when A is +square (`N==M`). + +`syhe` is a character, either `'S'` or `'H'`, indicating whether the +symmetric adjoint (`syhe=='S'`, and `y'==transpose(y)`) or the hermitian +adjoint (`syhe=='H'`, and `y'==adjoint(y)`) must be used. +""" +function generic_adr1!(uplo::AbstractChar, + alpha::ClosedScalar{TR}, + x::AbstractVector{TS}, + y::AbstractVector{TS}, + A::AbstractMatrix{TS}, + syhe::AbstractChar + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + + # Inputs must be 1-indexed; bounds may not be checked. + Base.require_one_based_indexing(x, A) + + # Check argument validity + K = length(x) + L = length(y) + N, M = size(A) + info = 0::BlasInt + if (uplo != 'U' && uplo != 'L' && uplo != 'F') || (uplo != 'F' && N != M) + info = (-1)::BlasInt + elseif K > N + info = (-3)::BlasInt + elseif L > M + info = (-4)::BlasInt + elseif syhe != 'S' && syhe != 'H' + info = (-6)::BlasInt + end + if info < 0 + arg_illegal("generic_sadr1!", -info, 'E') + end + + # Load the requested adjoining operator + adj_op = syhe == 'S' ? 
identity : conj + + # Define loop range function according to the type of storage + # TODO: can we adjust the range without anonymous functions, + # but without having to write the same code thrice? + i_range = uplo == 'F' ? _ -> (1:K) : uplo == 'U' ? j -> (1:min(j,K)) : j -> (j:K) + + # Compute rank update of A + for j in 1:L; @inbounds begin + if y[j] != 0 + temp = alpha * adj_op(y[j]) + for i in i_range(j) + A[i,j] += x[i] * temp + end + end + end; end + return +end + + +""" +generic_mvpv!(trans, alpha, A, x, beta, y) -> nothing + +`generic_mvpv!` performs the following matrix-vector operation: + +`y[1:K] = alpha*A'*x[1:L] + beta*y[1:K]` + +in-place, where `alpha` and `beta` are scalars, `x` is a vector with at +least L elements, `y` is a vector with at least K elements, and `A` is +an `NxM` matrix. `A'` can denote the transpose, i.e. `transpose(A)` or +the conjugate transpose, i.e. `adjoint(A)`, and then `M==K && N==L`. +`A'` can also denote no adjoining at all, i.e. `A'==A`, and then +`N==K && M==L`. + +`trans` is a character, either `'T'`, `'C'` or `'N'`, indicating whether +`A'=transpose(A)`, `A'=adjoint(A)` or `A'=A`, respectively. +""" +function generic_mvpv!(trans::AbstractChar, + alpha::ClosedScalar{TR}, + A::AbstractMatrix{TS}, + x::AbstractVector{TS}, + beta::ClosedScalar{TR}, + y::AbstractVector{TS}, + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + + # Inputs must be 1-indexed; bounds may not be checked. + Base.require_one_based_indexing(A, x, y) + + # Check argument validity + M, N = size(A) + K = trans == 'N' ? M : N + L = trans == 'N' ? N : M + info = 0::BlasInt + if trans != 'T' && trans != 'C' && trans != 'N' + info = (-1)::BlasInt + elseif length(y) < K + info = (-3)::BlasInt + elseif length(x) < L + info = (-4)::BlasInt + end + if info < 0 + arg_illegal("generic_sadr1!", -info, 'E') + end + + # Quick return if possible. + if K == 0 || (alpha == 0 && beta == 1); return; end + + # Start the operations. 
In this version the elements of A are + # accessed sequentially with one pass through A. + # First form y := beta*y. + @inbounds begin + if beta != 1 + if beta == 0 + # Way less allocations and way faster for BigFloat. + # For Float64 there is some (acceptable IMO) performance loss. + y[1:K] .= 0 + else + for i in 1:K; y[i] *= beta; end + end + end + if alpha == 0 || L == 0; return; end + + if trans == 'N' + # Form y := alpha*A*x + y. + for j in 1:L + # Faster than a loop + axpy!(alpha*x[j], view(A, 1:K, j), view(y, 1:K)) + end + else + # Form y := alpha*A**T*x + y or y := alpha*A**H*x + y. + noconj = (trans == 'T') + for i = 1:K + temp = 0 + if noconj + for j = 1:L + temp = temp + A[j,i]*x[j] + end + else + for j = 1:L + temp = temp + conj(A[j,i])*x[j] + end + end + y[i] += alpha*temp + end + end + end + return +end + + +""" +bk_rowcol_swap!(A, k, kp, kstep, upper, herm) -> did_swap::Bool + +Performs the row and column interchange of the Bunch-Kaufman factorization. +If `upper==true` then the rows and columns `kp` of `A[1:k,1:k]` are +interchanged with either rows and columns `k` or `k-1` of `A[1:k,1:k]`, +depending on whether `kstep==1` or `kstep==2`, respectively. If +`upper==false` then the rows and columns `kp-k+1` of `A[k:N,k:N]` are +interchanged with either rows and columns `1` or `2` of `A[k:N,k:N]`, +depending on whether `kstep==1` or `kstep==2`, respectively. `herm=true` +then it is assumed that `A` is Hermitian, and conjugation is applied to +the appropriate entries of the interchanged rows and columns. If +`herm=false` no conjugation is performed. + +This is an internal helper function for the main Bunch-Kaufman +factorization function, `generic_bunchkaufman!`. As such, validity of the +input values is not verified. +""" +function bk_rowcol_swap!( + A::AbstractMatrix{TS}, + k::Integer, + kp::Integer, + kstep::Integer, + upper::Bool, + herm::Bool + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + kk = upper ? 
k - kstep + 1 : k + kstep - 1 + if kp != kk + if kp > 1 + thisview = upper ? view(A, 1:(kp-1), :) : view(A, (kp+1):size(A,1), :) + Base.swapcols!(thisview, kp, kk) + end + thisrange = upper ? ((kp+1):(kk-1)) : ((kk+1):(kp-1)) + if !herm + # Real/complex symmetric case + for j in thisrange + A[j,kk], A[kp,j] = A[kp,j], A[j,kk] + end + A[kk,kk], A[kp,kp] = A[kp,kp], A[kk,kk] + else + # Hermitian case + for j in thisrange + A[j,kk], A[kp,j] = conj(A[kp,j]), conj(A[j,kk]) + end + A[kp,kk] = conj(A[kp,kk]) + A[kk,kk], A[kp,kp] = real(A[kp,kp]), real(A[kk,kk]) + end + if kstep == 2 + if herm + # Force diagonal entry to be purely real + A[k,k] = real(A[k,k]) + end + if upper + A[k-1,k], A[kp,k] = A[kp,k], A[k-1,k] + else + A[k+1,k], A[kp,k] = A[kp,k], A[k+1,k] + end + end + return true + else + return false + end +end + + +""" +generic_bunchkaufman!(uplo, A, syhe, rook::Bool=false) -> +LD<:AbstractMatrix, ipiv<:AbstractVector{Integer}, info::BlasInt + +Computes the Bunch-Kaufman factorization of a symmetric or Hermitian +matrix `A` of size `NxN` as `P'*U*D*U'*P` or `P'*L*D*L'*P`, depending on +which triangle is stored in `A`. Note that if `A` is complex symmetric +then `U'` and `L'` denote the unconjugated transposes, i.e. +`transpose(U)` and `transpose(L)`. The resulting `U` or `L` and D are +stored in-place in `A`, LAPACK style. `LD` is just a reference to `A` +(that is, `LD===A`). `ipiv` stores the permutation information of the +algorithm in LAPACK format. `info` indicates whether the factorization +was successful and non-singular when `info==0`, or else `info` takes a +different value. The outputs `LD`, `ipiv`, `info` follow the format of +the LAPACK functions of the Bunch-Kaufman factorization (`dsytrf`, +`csytrf`, `chetrf`, etc.), so this function can (ideally) be used +interchangeably with its LAPACK counterparts `LAPACK.sytrf!`, +`LAPACK.sytrf_rook!`, etc. 
+ +`uplo` is a character, either `'U'` or `'L'`, indicating whether the +matrix is stored in the upper triangular part (`uplo=='U'`) or in the +lower triangular part (`uplo=='L'`). + +`syhe` is a character, either `'S'` or `'H'`, indicating whether the +matrix is real/complex symmetric (`syhe=='S'`, and the symmetric +Bunch-Kaufman factorization is performed) or complex hermitian +(`syhe=='H'`, and the hermitian Bunch-Kaufman factorization is +performed). + +If `rook` is `true`, rook pivoting is used (also called bounded +Bunch-Kaufman factorization). If `rook` is `false`, rook pivoting is +not used (standard Bunch-Kaufman factorization). Rook pivoting can +require up to `~N^3/6` extra comparisons in addition to the `~N^3/3` +additions and `~N^3/3` multiplications of the standard Bunch-Kaufman +factorization. However, rook pivoting guarantees that the entries of +`U` or `L` are bounded. + +This function implements the factorization algorithm entirely in +native Julia, so it supports any number type representing real or +complex numbers. +""" +function generic_bunchkaufman!( + uplo::AbstractChar, + A::AbstractMatrix{TS}, + syhe::AbstractChar, + rook::Bool=false + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + + # Inputs must be 1-indexed; bounds may not be checked. + Base.require_one_based_indexing(A) + + # Initialize info integer as 0 + info = 0::BlasInt + # Get size of matrix + N, N2 = size(A) + # Initialize permutation vector + ipiv = Vector{BlasInt}(undef, N) + + # Check input correctness + if uplo != 'U' && uplo != 'L' + info = (-1)::BlasInt + elseif N != N2 + info = (-2)::BlasInt + elseif syhe != 'S' && syhe != 'H' + info = (-3)::BlasInt + end + if info < 0 + arg_illegal("generic_bunchkaufman!", -info, 'W') + return A, ipiv, info + end + # if rook + # error("Rook pivoting not implemented yet.") + # end + + # Initialize `alpha` for use in choosing pivot block size. 
+ # The exact value is + # (1 + sqrt(17)) / 8 ~= 0.6404 + # For rational matrices we use the small denominator approximation: + # 16/25 = 0.64 ~= (1 + sqrt(17)) / 8 + # in order to not increase the denominator size too much in computations. + # The error of this approximation is ≤0.1%, and it still guarantees that a + # 2x2 block in the D factor has a positive-negative eigenvalue pair, as long + # as the approximation lies in (0,1). + alpha = TR <: AbstractFloat ? (1 + sqrt(TR(17))) / 8 : TR(16//25) + # Use complex 1-norm for pivot selection, as in LAPACK + abs1_fun = TS <: Real ? abs : cabs1 + + # Check if the matrix is symmetric or hermitian + if syhe == 'S' || (syhe == 'H' && TS <: Real) + # Use symmetric variant if matrix is real, regardless of 'syhe' value + syhe = 'S' + diag_abs_fun = abs1_fun + else + diag_abs_fun = cabsr + end + + # Compute machine safe minimum when working with floating point numbers. + # LAPACK doesn't use this for diagonal pivoting though... + if rook + if TR <: AbstractFloat + # eps(0) gives the smallest subnormal number, and eps(1) gives the floating + # point type epsilon. eps(0)/eps(1) gives the smallest normal number, plus + # possibly some rounding error. + sfmin = nextfloat(eps(TR(0)) / eps(TR(1)), 2) + small = 1 / prevfloat(typemax(TR), 2) + if small >= sfmin + # 1/sfmin may overflow, so use 'small' plus a bit as the safe minimum + sfmin = nextfloat(small * (1 + eps(TR(1))), 2) + end + else + # We're working with rationals in this case, so all the results are exact. + sfmin = TR(0) + end + end + + # Run factorization depending on where the data is stored + upper = (uplo == 'U') + herm = (syhe == 'H') + # TODO: Is this gonna inline properly? + @inline k_cond = upper ? k -> k >= 1 : k -> k <= N + @inline irange = upper ? j -> (j:-1:1) : j -> (j:N) + @inline conj_op = herm ? conj : identity + @inline diagreal_op = herm ? (j -> A[j,j] = TS(real(A[j,j]))) : _ -> () + k = upper ? 
N : 1 + # Main loop, comments refer to the upper triangular version of the factorization. + # The lower triangular version is analogous. + while k_cond(k); @inbounds begin + kstep = 1 + knext = upper ? k - 1 : k + 1 + p = k + # Determine rows and columns to be interchanged and whether + # a 1-by-1 or 2-by-2 pivot block will be used + absakk = diag_abs_fun(A[k,k]) + # IMAX is the row-index of the largest off-diagonal element in + # column K, and COLMAX is its absolute value. + # Determine both COLMAX and IMAX. + if upper && k > 1 + colmax, imax = findmax(abs1_fun, view(A, 1:(k-1), k)) + elseif (!upper) && k < N + colmax, imax = findmax(abs1_fun, view(A, (k+1):N, k)) + imax += k + else + colmax = 0 + end + if (max(absakk, colmax) == 0) || isnan(absakk) + # Column K is zero or underflow, or contains a NaN: + # set INFO and continue + if info == 0 + info = k::BlasInt + end + kp = k + if herm + # Force diagonal entry to be purely real + A[k,k] = real(A[k,k]) + end + else + if absakk >= alpha*colmax + # no interchange, use 1-by-1 pivot block + kp = k + elseif rook + # Loop until pivot found + while true + # Begin pivot search loop body + # JMAX is the column-index of the largest off-diagonal + # element in row IMAX, and ROWMAX is its absolute value. + # Determine both ROWMAX and JMAX. + if imax != k + thisview = upper ? view(A, imax, (imax+1):k) : + view(A, imax, k:(imax-1)) + rowmax, jmax = findmax(abs1_fun, thisview) + jmax += upper ? imax : k - 1 + else + # LAPACK makes rowmax=0 in this case, but I believe it's + # better to make rowmax=-1, so that we guarantee that jmax + # will be define in the next if-block. + # TODO: is this correct/safe? + rowmax = 0 + end + if (upper && imax > 1) || ((!upper) && imax < N) + # Remember that we only have the upper triangular part + # of the matrix. We inspect the part of the row in the + # lower triangular part by traversing the corresponding + # part of the transpose column. 
+ if upper + stemp, itemp = findmax(abs1_fun, view(A, 1:(imax-1), imax)) + else + stemp, itemp = findmax(abs1_fun, view(A, (imax+1):N, imax)) + itemp += imax + end + if stemp > rowmax + rowmax = stemp + jmax = itemp + end + end + # Equivalent to testing for (used to handle NaN and Inf) + # CABS1( A( IMAX, IMAX ) ).GE.ALPHA*ROWMAX + if !(diag_abs_fun(A[imax,imax]) < alpha*rowmax) + # interchange rows and columns K and IMAX, + # use 1-by-1 pivot block + kp = imax + break + # Equivalent to testing for ROWMAX .EQ. COLMAX, + # used to handle NaN and Inf + elseif (p == jmax || rowmax <= colmax) + # interchange rows and columns K+1 and IMAX, + # use 2-by-2 pivot block + kp = imax + kstep = 2 + break + else + # Pivot NOT found, set variables and repeat + p = imax + colmax = rowmax + imax = jmax + end + # End pivot search loop body + end + else + # JMAX is the column-index of the largest off-diagonal + # element in row IMAX, and ROWMAX is its absolute value + # We don't really need JMAX, se we don't store it + thisview = upper ? view(A, imax, (imax+1):k) : view(A, imax, k:(imax-1)) + rowmax = findmax(abs1_fun, thisview)[1] + if (upper && imax > 1) || ((!upper) && imax < N) + # Remember that we only have the upper triangular part + # of the matrix. We inspect the part of the row in the + # lower triangular part by traversing the corresponding + # part of the transpose column. + thisview = upper ? 
view(A, 1:(imax-1), imax) : + view(A, (imax+1):N, imax) + rowmax = max(rowmax, findmax(abs1_fun, thisview)[1]) + end + if absakk >= alpha * colmax * (colmax/rowmax) + # no interchange, use 1-by-1 pivot block + kp = k + elseif diag_abs_fun(A[imax,imax]) >= alpha * rowmax + # interchange rows and columns K and IMAX, use 1-by-1 + # pivot block + kp = imax + else + # interchange rows and columns K-1 and IMAX, use 2-by-2 + # pivot block + kp = imax + p = imax + kstep = 2 + end + end + # Swap TWO rows and TWO columns + # First swap + # The first swap only needs to be done when using rook pivoting + if rook && kstep == 2 + # Interchange rows and column K and P in the leading + # submatrix A(1:k,1:k) if we have a 2-by-2 pivot + bk_rowcol_swap!(A, k, p, 1, upper, herm) + end + # Second swap + did_swap = bk_rowcol_swap!(A, k, kp, kstep, upper, herm) + if herm && (!did_swap) + # Force diagonal entries to be purely real + A[k,k] = real(A[k,k]) + if kstep == 2 + A[knext,knext] = real(A[knext,knext]) + end + end + if kstep == 1 + # 1-by-1 pivot block D(k): column k now holds + # W(k) = U(k)*D(k) + # where U(k) is the k-th column of U + # When rook=false, sfmin is not defined, but the short-circuit + # evaluation of the conditional avoids an error. + if (!rook) || absakk >= sfmin + # Perform a rank-1 update of A(1:k-1,1:k-1) as + # A := A - U(k)*D(k)*U(k)' = A - W(k)*1/D(k)*W(k)' + # Compute 1/D(k) + r1 = !herm ? 1 / A[k,k] : 1 / real(A[k,k]) + # Perform rank-1 update to store the Schur complement + # in a submatrix of A + x = upper ? view(A, 1:(k-1), k) : view(A, (k+1):N, k) + # if 'upper' this should assign by reference + thisview = upper ? A : view(A, (k+1):N, (k+1):N) + generic_adr1!(uplo, -r1, x, x, thisview, syhe) + # Store U(k) in column k + thisrange = upper ? (1:(k-1)) : ((k+1):N) + for i in thisrange + A[i,k] *= r1 + end + else + # Compute D(k) + r1 = !herm ? A[k,k] : real(A[k,k]) + # Store U(k) in column k + thisrange = upper ? 
(1:(k-1)) : ((k+1):N) + for i in thisrange + A[i,k] /= r1 + end + # Perform a rank-1 update of A(k+1:n,k+1:n) as + # A := A - U(k)*D(k)*U(k)**T + # = A - W(k)*(1/D(k))*W(k)**T + # = A - (W(k)/D(k))*(D(k))*(W(k)/D(K))**T + # Perform rank-1 update to store the Schur complement + # in a submatrix of A + x = upper ? view(A, 1:(k-1), k) : view(A, (k+1):N, k) + # if 'upper' this should assign by reference + thisview = upper ? A : view(A, (k+1):N, (k+1):N) + generic_adr1!(uplo, -r1, x, x, thisview, syhe) + end + elseif (upper && k > 2) || ((!upper) && k < N - 1) + # 2-by-2 pivot block D(k): columns k and k-1 now hold + # ( W(k-1) W(k) ) = ( U(k-1) U(k) )*D(k) + # where U(k) and U(k-1) are the k-th and (k-1)-th columns + # of U + # Perform a rank-2 update of A(1:k-2,1:k-2) as + # A := A - ( U(k-1) U(k) )*D(k)*( U(k-1) U(k) )' + # = A - ( W(k-1) W(k) )*inv(D(k))*( W(k-1) W(k) )' + thisrange = upper ? ((k-2):-1:1) : ((k+2):N) + if !herm + # Real/complex symmetric case + #TODO: is this way to compute the inverse backward stable? + # (it probably is as it comes from LAPACK) + dxk = A[knext,k] + dxx = A[knext,knext] / dxk + dkk = A[k,k] / dxk + t = 1 / (dkk * dxx - 1) + dxk = t / dxk + dkx = dxk + else + # Hermitian case + # TODO: is this way to compute the inverse backward stable? + # (it probably is as it is a small modification of LAPACK's + # method) + dxk = A[knext,k] + dxx = real(A[knext,knext]) / dxk + dkk = real(A[k,k]) / conj(dxk) + t = 1 / (real(dkk * dxx) - 1) + dkx = t / conj(dxk) + dxk = t / dxk + end + for j in thisrange + wknext = dxk * (dkk*A[j,knext] - A[j,k]) + wk = dkx * (dxx*A[j,k] - A[j,knext]) + for i in irange(j) + A[i,j] -= (A[i,k]*conj_op(wk) + A[i,knext]*conj_op(wknext)) + end + A[j,k] = wk + A[j,knext] = wknext + # Force diagonal entry to be purely real, but still of + # complex type TS (I don't know why in LAPACK this + # case, unlike the rest, enforces a complex type + # explicitly). 
+ diagreal_op(j) + end + end + end + # Store details of the interchanges in IPIV + if kstep == 1 + ipiv[k] = kp + else + ipiv[k] = -p + ipiv[knext] = -kp + end + # Decrease K and return to the start of the main loop + # k -= upper ? kstep : -kstep + if upper; k -= kstep; else; k += kstep; end + end; end + return A, ipiv, info +end + + +""" +generic_syconv(F, gettri::Bool=true) -> +(TLU<:Union{AbstractMatrix,Nothing}, e<:AbstractVector, + d<:Union{AbstractVector,Nothing}) + +`generic_syconv` takes the Bunch-Kaufman object `F` and returns the +block-diagonal factor `D`, and the triangular factor `L` (or `U`) if +requested. If the `L` or `U` factor is requested then both `L` (or `U`) and +the main diagonal of `D` will be stored in `TLU`, following LAPACK format, +and `d` will be set to `nothing`. `e` contains the first subdiagonal of +`D`. If the triangular factor is not requested, then `TLU` will not be set +to `nothing`, and the main diagonal of `D` will be stored in `d`. + +`gettri` is a `Bool`, indicating whether the `L` (or `U`) triangular factor +should be computed (`gettri==true`) or not (`gettri==false`). If the +triangular factor is required, a copy of `A.LD` will be created, and the +triangular factor will be computed in-place in said copy. +""" +function generic_syconv( + F::BunchKaufman{TS}, + gettri::Bool=true + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + + # Inputs must be 1-indexed; bounds may not be checked. + Base.require_one_based_indexing(F.LD, F.ipiv) + + # Extract necessary variables + A, ipiv, rook = gettri ? deepcopy(F.LD) : F.LD, F.ipiv, F.rook + + # Get size of matrix + N = size(A)[1] + + # Initialize off-diagonal and diagonal vector + e = Vector{TS}(undef, N) + d = gettri ? nothing : diag(A, 0) + + # Quick return if possible + if N == 0; return gettri ? A : nothing, e, d; end + + # Main loops + upper = (F.uplo == 'U') + @inline icond_d = upper ? i -> i > 1 : i -> i < N + @inline icond_T = upper ? 
i -> i >= 1 : i -> i <= N + @inline inext = upper ? i -> i - 1 : i -> i + 1 + # Convert VALUE + i = upper ? N : 1 + e[N+1-i] = 0 + while icond_d(i); @inbounds begin + if ipiv[i] < 0 + ix = inext(i) + e[i] = A[ix,i] + e[ix] = 0 + if gettri; A[ix,i] = 0; end + if upper; i -= 1; else; i += 1; end + else + e[i] = 0 + end + if upper; i -= 1; else; i += 1; end + end; end + # Convert PERMUTATIONS + if gettri + i = upper ? N : 1 + while icond_T(i); @inbounds begin + thisview = upper ? view(A, :, (i+1):N) : view(A, :, 1:(i-1)) + ip = ipiv[i] + if ip > 0 || rook + Base.swaprows!(thisview, abs(ip), i) + end + if ip <= 0 + ix = inext(i) + Base.swaprows!(thisview, -ipiv[ix], ix) + if upper; i -= 1; else; i += 1; end + end + if upper; i -= 1; else; i += 1; end + end; end + end + return gettri ? A : nothing, e, d +end + + +""" +generic_bksolve!(F, B) -> X<:AbstractVecOrMat + +`generic_bksolve!` solves a system of linear equations `A*X = B` where +the Bunch-Kaufman factorization of `A` is provided by `F`. +""" +function generic_bksolve!( + F::BunchKaufman{TS}, + B0::AbstractVecOrMat{TS}, + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + + # Inputs must be 1-indexed; bounds may not be checked. + Base.require_one_based_indexing(F.LD, F.ipiv, B0) + + # Get size of matrices + N = size(F.LD)[1] + if typeof(B0) <: AbstractVector + N3 = size(B0)[1] + M = 1 + B = view(B0, :, :) + else + N3, M = size(B0) + B = B0 + end + + # Initialize info integer as 0 + info = 0::BlasInt + + # Check input correctness + if N3 != N + info = (-2)::BlasInt + end + if info < 0 + arg_illegal("generic_bksolve!", -info, 'E') + end + + # Quick return if possible + if N == 0 || M == 0; return B; end + + # Extract necessary variables + A, ipiv, symm, rook = F.LD, F.ipiv, issymmetric(F), F.rook + + # Load the requested adjoining operator + adj_op = symm ? identity : conj + + R1 = TR(1) + upper = (F.uplo == 'U') + @inline kcond1 = upper ? k -> k >= 1 : k -> k <= N + @inline kcond2 = upper ? 
k -> k <= N : k -> k >= 1 + @inline knext = upper ? k -> k - 1 : k -> k + 1 + @inline knext2 = upper ? k -> k + 1 : k -> k - 1 + k = upper ? N : 1 + while kcond1(k); @inbounds begin + kp = ipiv[k] + if kp > 0 + # 1 x 1 diagonal block + # Interchange rows K and IPIV(K). + Base.swaprows!(B, k, kp) + # Multiply by inv(U(K)), where U(K) is the transformation + # stored in column K of A. + Aview = upper ? view(A, 1:(k-1), k) : view(A, (k+1):N, k) + Bview = upper ? B : view(B, (k+1):N, :) + generic_adr1!('F', -R1, Aview, view(B, k, :), Bview, 'S') + # Multiply by the inverse of the diagonal block. + s = symm ? 1 / A[k,k] : 1 / real(A[k,k]) + for j in 1:M; B[k,j] *= s; end + if upper; k -= 1; else; k += 1; end + else + # 2 x 2 diagonal block + # Interchange rows K and -IPIV(K) THEN K-1 and -IPIV(K-1) + # The first interchange is only needed when rook pivoting is used + if rook; Base.swaprows!(B, k, -kp); end + kx = knext(k) + Base.swaprows!(B, kx, -ipiv[kx]) + # Multiply by inv(U(K)), where U(K) is the transformation + # stored in columns K-1 and K of A. + Aview = upper ? view(A, 1:(k-2), k) : view(A, (k+2):N, k) + Bview = upper ? B : view(B, (k+2):N, :) + generic_adr1!('F', -R1, Aview, view(B, k, :), Bview, 'S') + Aview = upper ? view(A, 1:(k-2), kx) : view(A, (k+2):N, kx) + generic_adr1!('F', -R1, Aview, view(B, kx, :), Bview, 'S') + # Multiply by the inverse of the diagonal block. + axk = A[kx,k] + axx = A[kx,kx] / axk + akk = A[k,k] / adj_op(axk) + denom = axx*akk - 1 + for j in 1:M + bx = B[kx,j] / axk + bk = B[k,j] / adj_op(axk) + B[kx,j] = (akk*bx - bk) / denom + B[k,j] = (axx*bk - bx) / denom + end + if upper; k -= 2; else; k += 2; end + end + end; end + # Next solve U'*X = B, overwriting B with X. + # K is the main loop index, increasing from 1 to N in steps of + # 1 or 2, depending on the size of the diagonal blocks. + k = upper ? 1 : N + while kcond2(k); @inbounds begin + Aview = upper ? view(A, 1:(k-1), k) : view(A, (k+1):N, k) + Bview = upper ? 
view(B, 1:(k-1), :) : view(B, (k+1):N, :) + B_row = view(B, k, :) + kp = ipiv[k] + if kp > 0 + # 1 x 1 diagonal block + # Multiply by inv(U**T(K)), where U(K) is the transformation + # stored in column K of A. + if symm + generic_mvpv!('T', -R1, Bview, Aview, R1, B_row) + else + conj!(B_row) + generic_mvpv!('C', -R1, Bview, Aview, R1, B_row) + conj!(B_row) + end + # Interchange rows K and IPIV(K). + Base.swaprows!(B, k, kp) + if upper; k += 1; else; k -= 1; end + else + # 2 x 2 diagonal block + # Multiply by inv(U**T(K+1)), where U(K+1) is the transformation + # stored in columns K and K+1 of A. + kx = knext2(k) + if symm + generic_mvpv!('T', -R1, Bview, Aview, R1, B_row) + Aview = upper ? view(A, 1:(k-1), kx) : view(A, (k+1):N, kx) + B_row = view(B, kx, :) + generic_mvpv!('T', -R1, Bview, Aview, R1, B_row) + elseif k > 1 + conj!(B_row) + generic_mvpv!('C', -R1, Bview, Aview, R1, B_row) + conj!(B_row) + Aview = upper ? view(A, 1:(k-1), kx) : view(A, (k+1):N, kx) + B_row = view(B, kx, :) + conj!(B_row) + generic_mvpv!('C', -R1, Bview, Aview, R1, B_row) + conj!(B_row) + end + # Interchange rows K and -IPIV(K) THEN K+1 and -IPIV(K+1). + # The second interchange is only needed when rook pivoting is used + Base.swaprows!(B, k, -kp) + if rook; Base.swaprows!(B, kx, -ipiv[kx]); end + if upper; k += 2; else; k -= 2; end + end + end; end + return B +end + + +""" +inertia(B::BunchKaufman; atol::Real=0, rtol::Real=atol>0 ? 0 : n*ϵ) -> + np::Union{Nothing,Integer}, nn::Union{Nothing,Integer}, nz::Integer + +`inertia` computes the numerical inertia (the number of positive, +negative and zero eigenvalues, given by `np`, `nn` and `nz`, +respectively) of a real symmetric of Hermitian matrix `B` that has been +factored using the Bunch-Kaufman algorithm. For complex symmetric +matrices the inertia is not defined. in that case `np` and `nn` are set +to `nothing`, but the function still returns the number of zero +eigenvalues. 
The inertia is computed by counting the signs of the eigenvalues
+of `B.D`. The number of zero eigenvalues is computed as the number of
+estimated eigenvalues with complex 1-norm (defined as `|re(.)|+|im(.)|`)
+less than or equal to `max(atol, rtol*s₁)`, where `s₁` is an upper bound of
+the largest singular value of `B.D`, `σ₁` (more specifically,
+`0.5*s₁ <= σ₁ <= s₁` for real matrices and `0.35*s₁ <= σ₁ <= s₁` for
+complex matrices). `atol` and `rtol` are the absolute and relative
+tolerances, respectively. The default relative tolerance is `n*ϵ`, where
+`n` is the size of `A`, and `ϵ` is the [`eps`](@ref) of the number
+type of `A`, if this type is a subtype of `AbstractFloat`. In any other
+case (if the number type of `A` is `Rational`, for example) `ϵ` is set
+to `0`.
+
+!!! note
+    Numerical inertia can be a sensitive and imprecise characterization of
+    ill-conditioned matrices with eigenvalues that are close in magnitude to the
+    threshold tolerance `max(atol, rtol*s₁)`. In such cases, slight perturbations
+    to the Bunch-Kaufman computation or to the matrix can change the result of
+    `rank` by pushing one or more eigenvalues across the threshold. These
+    variations can even occur due to changes in floating-point errors between
+    different Julia versions, architectures, compilers, or operating systems.
+    In particular, the size of the entries of the triangular factor directly
+    influence the scale of the eigenvalues of the diagonal factor, so it is
+    strongly recommended to use rook pivoting if the inertia is going to be
+    computed.
+    On the other hand, if the matrix has rational entries, the inertia
+    computation is guaranteed to be exact, as long as there is no
+    under/overflow in the underlying integer type (and in such cases Julia itself
+    throws an error), or a positive tolerance (absolute or relative) is
+    specified. 
+""" +function inertia(B::BunchKaufman{TS}; + atol::TR = TR(0), + rtol::TR = TR(0) + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + + # Check if matrix is complex symmetric + get_inertia = !(issymmetric(B) && TS <: Complex) + + # Initialize outputs + np, nn, nz = get_inertia ? (0, 0, 0) : (nothing, nothing, 0) + + # Compute matrix size + N = size(B, 1) + + # Quick return if possible + if N == 0; return np, nn, nz; end + + # Compute default relative tolerance + if rtol <= 0 && atol <= 0 + rtol = TR <: AbstractFloat ? (N * eps(TR)) : TR(0) + end + + # We use the complex 1-norm for complex matrices + real_matrix = (TS <: Real) + abs1_fun = real_matrix ? abs : cabs1 + real_fun = real_matrix ? identity : real + + # Check if we must track the largest singular value + get_s1 = (rtol > 0) + + # Constant for lower bound estimation of the smallest eigenvalue in 2x2 blocks. + # The best (largest) value for complex matrices is 1/sqrt(2), but for rational + # matrices we use the small denominator approximation 12/17, in order to not + # increase the denominator size too much in computations. The error of this + # approximation is ≤0.2%, and we still get a valid lower bound. + c = real_matrix ? TR(1) : (TR <: AbstractFloat ? 1/sqrt(TR(2)) : TR(12//17)) + + # First pass, estimate largest singular value and group together size-1 blocks + D = B.D + s1 = TR(0) + i = 1 + while i <= N; @inbounds begin + if i < N && D[i,i+1] != 0 + # 2x2 block + # The largest singular value of a 2x2 matrix is between [1, 2] times + # its complex max-norm, which is between [c, 1] times the largest + # complex 1-norm among the entries of the 2x2 matrix. See "Roger + # Horn and Charles Johnson. Matrix Analysis, 2nd Edition, 5.6.P23". 
+ abs_Dii = abs1_fun(D[i,i]) + abs_Dxx = abs1_fun(D[i+1,i+1]) + s1_block = 2 * max(abs_Dii, abs1_fun(D[i,i+1]), abs_Dxx) + if get_s1; s1 = max(s1, s1_block); end + # Lower bound on the smallest eigenvalue complex 2-norm is + # abs(λ₂) ≥ abs(det(block)) / s1_block + # so the bound in terms of the complex 1-norm becomes + # abs1_fun(λ₂) ≥ c * abs1_fun(det(block)) / s1_block + # For rational matrices, if λ₂=0 then det(block)=0 and then the bound + # becomes zero too. If λ₁=0 too then the block has all zero entries + # and 's1_block'=0, but 'D[i,i+1]' != 0 and so 's1_block' > 0. However, we + # may still have that 'smin_block'≈0, then the value of 'smin_block' may not + # be accurate. In that case the counting routine will detect that both + # eigenvalues are zero without using 'smin_block', so it doesn't matter. + # TODO: is this the most numerically stable way to compute the determinant? + # TODO: is this the best way to avoid under/overflow? + if abs_Dii >= abs_Dxx + smin_block = c * abs1_fun((D[i,i]/s1_block)*D[i+1,i+1] - + (D[i,i+1]/s1_block)*D[i+1,i]) + else + smin_block = c * abs1_fun(D[i,i]*(D[i+1,i+1]/s1_block) - + (D[i,i+1]/s1_block)*D[i+1,i]) + end + # Store lower bound in-place in the lower off-diagonal and upper bound + # in-place in the upper off-diagonal. The trace is stored in the first + # diagonal entry block, but only if the full inertia is needed. + D[i,i+1] = s1_block + D[i+1,i] = smin_block + if get_inertia; D[i,i] += D[i+1,i+1]; end + i += 2 + else + # 1x1 block + if get_s1; s1 = max(s1, abs1_fun(D[i,i])); end + i += 1 + end + end; end + + # Second pass, count eigenvalue signs + tol = max(atol, rtol * s1) + i = 1 + while i <= N; @inbounds begin + if i < N && D[i,i+1] != 0 + # 2x2 block. For the counting of zero eigenvalues we use the lower bound on the + # eigenvalues' magnitude. This way, if an eigenvalue is deemed non-zero, then + # it is guaranteed that its magnitude is greater than the tolerance. 
+            s1_block = real_fun(D[i,i+1])
+            if (c / 2) * s1_block <= tol
+                # Lower bound of largest eigenvalue is smaller than the tolerance,
+                # we consider both eigenvalues of this block to be zero.
+                nz += 2
+                i += 2
+                continue
+            end
+            # Reaching this part of the loop implies that 's1_block' != 0.
+            smin_block = real_fun(D[i+1,i])
+            trace_block = real_fun(D[i,i])
+            if smin_block > tol || trace_block == 0
+                # If first condition holds then the lower bound of the smallest eigenvalue
+                # is larger than the tolerance. If the second condition holds then the trace
+                # is exactly zero, so both eigenvalues have the same magnitude, and we
+                # already know that the largest one is non-zero. In any case we conclude
+                # that both eigenvalues are non-zero.
+                if get_inertia
+                    # The eigenvalues of a 2x2 block are guaranteed to be a
+                    # positive-negative pair.
+                    np += 1
+                    nn += 1
+                end
+            else
+                # The lower bound of the smallest eigenvalue is smaller than the tolerance and
+                # the trace is non-zero, so we consider the smallest eigenvalue of this
+                # block to be zero.
+                nz += 1
+                if get_inertia
+                    # The trace is non-zero, and its sign is the same as that of the largest
+                    # eigenvalue.
+                    if trace_block >= 0
+                        np += 1
+                    else
+                        nn += 1
+                    end
+                end
+            end
+            i += 2
+        else
+            # 1x1 block
+            if get_inertia
+                eig = real_fun(D[i,i])
+                if eig > tol
+                    np += 1
+                elseif eig < -tol
+                    nn += 1
+                else
+                    nz += 1
+                end
+            elseif abs1_fun(D[i,i]) <= tol
+                nz += 1
+            end
+            i += 1
+        end
+    end; end
+
+    return np, nn, nz
+end
+
+
+"""
+    bunchkaufman_native!(A, rook::Bool=false; check = true) -> BunchKaufman
+
+`bunchkaufman_native!` is the same as [`bunchkaufman!`](@ref), but it performs
+the factorization in native Julia code instead of calling LAPACK. 
+""" +function bunchkaufman_native!(A::AbstractMatrix{TS}, + rook::Bool = false; + check::Bool = true, + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + if A isa RealHermSymComplexSym{TR} + syhe = 'S' + elseif ishermitian(A) + syhe = 'H' + elseif issymmetric(A) + syhe = 'S' + else + throw(ArgumentError("Bunch-Kaufman decomposition is only valid for " * + "symmetric or Hermitian matrices")) + end + if A isa HermOrSym + Adata = A.data + uplo = A.uplo + else + Adata = A + uplo = 'U' + end + LD, ipiv, info = generic_bunchkaufman!(uplo, Adata, syhe, rook) + check && checknonsingular(info) + return BunchKaufman(LD, ipiv, uplo, syhe == 'S', rook, info) +end + + +""" +Overload 'bunchkaufman.jl' methods through multiple dispatch +""" + +function bunchkaufman!(A::AbstractMatrix{TS}, + rook::Bool = false; + check::Bool = true + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + return bunchkaufman_native!(A, rook; check) +end + +function bunchkaufman(A::AbstractMatrix{TS}, + rook::Bool = false; + check::Bool = true + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + return bunchkaufman!(eigencopy_oftype(A, TS), rook; check) +end + +function bunchkaufman(A::AbstractMatrix{TS}, + rook::Bool = false; + check::Bool = true + ) where TS <:Union{TI, Complex{TI}} where TI <: Integer + + # Identity whether matrix is symmetric or Hermitian or none + if A isa Symmetric + TA = Symmetric + elseif A isa Hermitian + TA = Hermitian + else + TA = Nothing + end + + # Create a rational copy of input integer matrix, as the Bunch-Kaufman + # algorithm is closed over the rationals but not over the integers. 
+ # We promote input to BigInt to avoid overflow problems + if TA == Nothing + if TS <: Integer + M = Rational{BigInt}.(eigencopy_oftype(A, TS)) + else + M = Complex{Rational{BigInt}}.(eigencopy_oftype(A, TS)) + end + else + if TS <: Integer + M = TA(Rational{BigInt}.(eigencopy_oftype(A, TS)), Symbol(A.uplo)) + else + M = TA(Complex{Rational{BigInt}}.(eigencopy_oftype(A, TS)), + Symbol(A.uplo)) + end + end + + return bunchkaufman_native!(M, rook; check) +end + +function ldiv!(B::BunchKaufman{TS}, + R::AbstractVecOrMat{TS} + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + return generic_bksolve!(B, R) +end + +function inv(B::BunchKaufman{TS}) where TS <: ClosedScalar{TR} where TR <: ClosedReal + # I don't think there's value in implementing tha LAPACK in-place inverse + # functions `dsytri`, `chetri`, etc., unless of course an efficient + # in-place inverse function `inv!` is needed. + # TODO: reduce the operation count of the inverse by not computing the + # lower/upper triangular part. + if issymmetric(B) + return copytri!(B \ I, B.uplo) + else + return copytri!(B \ I, B.uplo, true) + end +end diff --git a/stdlib/LinearAlgebra/test/bunchkaufman.jl b/stdlib/LinearAlgebra/test/bunchkaufman.jl index 613e4d09a3cc6..d2305844db63e 100644 --- a/stdlib/LinearAlgebra/test/bunchkaufman.jl +++ b/stdlib/LinearAlgebra/test/bunchkaufman.jl @@ -21,9 +21,24 @@ a2img = randn(n,n)/2 breal = randn(n,2)/2 bimg = randn(n,2)/2 -@testset "$eltya argument A" for eltya in (Float32, Float64, ComplexF32, ComplexF64, Int) - a = eltya == Int ? rand(1:7, n, n) : convert(Matrix{eltya}, eltya <: Complex ? complex.(areal, aimg) : areal) - a2 = eltya == Int ? rand(1:7, n, n) : convert(Matrix{eltya}, eltya <: Complex ? 
complex.(a2real, a2img) : a2real) +areint = rand(1:7, n, n) +aimint = rand(1:7, n, n) +a2reint = rand(1:7, n, n) +a2imint = rand(1:7, n, n) +breint = rand(1:5, n, 2) +bimint = rand(1:5, n, 2) + +@testset "$eltya argument A" for eltya in (Float32, Float64, ComplexF32, ComplexF64, Int, ### + Float16, Complex{Float16}, BigFloat, Complex{BigFloat}, Complex{Int}, BigInt, + Complex{BigInt}, Rational{BigInt}, Complex{Rational{BigInt}}) + # a = eltya == Int ? rand(1:7, n, n) : convert(Matrix{eltya}, eltya <: Complex ? complex.(areal, aimg) : areal) + # a2 = eltya == Int ? rand(1:7, n, n) : convert(Matrix{eltya}, eltya <: Complex ? complex.(a2real, a2img) : a2real) + a = convert(Matrix{eltya}, eltya <: Complex ? (real(eltya) <: AbstractFloat ? + complex.(areal, aimg) : complex.(areint, aimint)) : (eltya <: AbstractFloat ? + areal : areint)) + a2 = convert(Matrix{eltya}, eltya <: Complex ? (real(eltya) <: AbstractFloat ? + complex.(a2real, a2img) : complex.(a2reint, a2imint)) : (eltya <: AbstractFloat ? + a2real : a2reint)) asym = transpose(a) + a # symmetric indefinite aher = a' + a # Hermitian indefinite apd = a' * a # Positive-definite @@ -34,9 +49,39 @@ bimg = randn(n,2)/2 view(apd , 1:n, 1:n))) ε = εa = eps(abs(float(one(eltya)))) + # Inertia tests + @testset "$uplo Bunch-Kaufman factor inertia" for uplo in (:L, :U) + @testset "rook pivoting: $rook" for rook in (false, true) + test_list = eltya <: Complex ? (Hermitian(aher, uplo), Hermitian(apd, uplo)) : + (Symmetric(transpose(a) + a, uplo), Hermitian(aher, uplo), + Hermitian(apd, uplo)) + ελ = n*max(eps(Float64), εa) # zero-eigenvalue threshold + ελ = typeof(Integer(one(real(eltya)))) <: Signed ? Rational{BigInt}(ελ) : + real(eltya(ελ)) + for M in test_list + bc = bunchkaufman(M, rook) + D = bc.D + λ = real(eltya <: Complex ? 
eigen(ComplexF64.(D)).values : + eigen(Float64.(D)).values) + σ₁ = norm(λ, Inf) + np = sum(λ .> ελ*σ₁) + nn = sum(λ .< -ελ*σ₁) + nz = n - np - nn + if real(eltya) <: AbstractFloat + @test inertia(bc) == (np, nn, nz) + else + @test inertia(bc; rtol=ελ) == (np, nn, nz) + end + end + end + end + # check that factorize gives a Bunch-Kaufman - @test isa(factorize(asym), LinearAlgebra.BunchKaufman) - @test isa(factorize(aher), LinearAlgebra.BunchKaufman) + if eltya <: Union{Float32, Float64, ComplexF32, ComplexF64, Int} + # Default behaviour only uses Bunch-Kaufman for these types, for now. + @test isa(factorize(asym), LinearAlgebra.BunchKaufman) + @test isa(factorize(aher), LinearAlgebra.BunchKaufman) + end @testset "$uplo Bunch-Kaufman factor of indefinite matrix" for uplo in (:L, :U) bc1 = bunchkaufman(Hermitian(aher, uplo)) @test LinearAlgebra.issuccess(bc1) @@ -89,15 +134,25 @@ bimg = randn(n,2)/2 @test Base.propertynames(bc1) == (:p, :P, :L, :U, :D) end - @testset "$eltyb argument B" for eltyb in (Float32, Float64, ComplexF32, ComplexF64, Int) - b = eltyb == Int ? rand(1:5, n, 2) : convert(Matrix{eltyb}, eltyb <: Complex ? complex.(breal, bimg) : breal) + @testset "$eltyb argument B" for eltyb in (Float32, Float64, ComplexF32, ComplexF64, Int, ### + Float16, Complex{Float16}, BigFloat, Complex{BigFloat}, Complex{Int}, BigInt, + Complex{BigInt}, Rational{BigInt}, Complex{Rational{BigInt}}) + # b = eltyb == Int ? rand(1:5, n, 2) : convert(Matrix{eltyb}, eltyb <: Complex ? complex.(breal, bimg) : breal) + b = convert(Matrix{eltyb}, eltyb <: Complex ? (real(eltyb) <: AbstractFloat ? + complex.(breal, bimg) : complex.(breint, bimint)) : (eltyb <: AbstractFloat ? + breal : breint)) for b in (b, view(b, 1:n, 1:2)) εb = eps(abs(float(one(eltyb)))) ε = max(εa,εb) + epsc = eltya <: Complex ? 
sqrt(2)*n : n # tolerance scale @testset "$uplo Bunch-Kaufman factor of indefinite matrix" for uplo in (:L, :U) bc1 = bunchkaufman(Hermitian(aher, uplo)) - @test aher*(bc1\b) ≈ b atol=1000ε + # @test aher*(bc1\b) ≈ b atol=1000ε + cda = eltya <: Complex ? cond(ComplexF64.(aher)) : cond(Float64.(aher)) + cda = real(eltya) <: AbstractFloat ? real(eltya(cda)) : cda + @test norm(aher*(bc1\b) - b) <= epsc*sqrt(eps(cda))*max( + norm(aher*(bc1\b)), norm(b)) end @testset "$uplo Bunch-Kaufman factors of a pos-def matrix" for uplo in (:U, :L) @@ -112,8 +167,14 @@ bimg = randn(n,2)/2 @test logdet(bc2) ≈ log(det(bc2)) @test logabsdet(bc2)[1] ≈ log(abs(det(bc2))) @test logabsdet(bc2)[2] == sign(det(bc2)) - @test inv(bc2)*apd ≈ Matrix(I, n, n) - @test apd*(bc2\b) ≈ b rtol=eps(cond(apd)) + # @test inv(bc2)*apd ≈ Matrix(I, n, n) rtol=Base.rtoldefault(real(eltya)) + # @test apd*(bc2\b) ≈ b rtol=eps(cond(apd)) + @test norm(inv(bc2)*apd - Matrix(I, n, n)) <= epsc*Base.rtoldefault( + real(eltya))*max(norm(inv(bc2)*apd), norm(Matrix(I, n, n))) + cda = eltya <: Complex ? cond(ComplexF64.(apd)) : cond(Float64.(apd)) + cda = real(eltya) <: AbstractFloat ? 
real(eltya(cda)) : cda + @test norm(apd*(bc2\b) - b) <= epsc*sqrt(eps(cda))*max( + norm(apd*(bc2\b)), norm(b)) @test ishermitian(bc2) @test !issymmetric(bc2) || eltya <: Real end From 2091058c5a2be7b70f3cad0cf1ce0ea7391863b0 Mon Sep 17 00:00:00 2001 From: tecosaur Date: Sat, 30 Dec 2023 20:20:28 +0800 Subject: [PATCH 11/13] Fix :noshift construction of an empty SubString (#51923) --- base/strings/substring.jl | 2 +- test/strings/basic.jl | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/base/strings/substring.jl b/base/strings/substring.jl index dfd8770b08d47..ffa78c00cb885 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -37,7 +37,7 @@ struct SubString{T<:AbstractString} <: AbstractString return new(s, i-1, nextind(s,j)-i) end function SubString{T}(s::T, i::Int, j::Int, ::Val{:noshift}) where T<:AbstractString - @boundscheck begin + @boundscheck if !(i == j == 0) si, sj = i + 1, prevind(s, j + i + 1) @inbounds isvalid(s, si) || string_index_err(s, si) @inbounds isvalid(s, sj) || string_index_err(s, sj) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 225a41bf12be9..87d812c5bf201 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -203,6 +203,12 @@ end @test (@views (x[3], x[1:2], x[[1,4]])) isa Tuple{Char, SubString, String} @test (@views (x[3], x[1:2], x[[1,4]])) == ('c', "ab", "ad") end + + @testset ":noshift constructor" begin + @test SubString("", 0, 0, Val(:noshift)) == "" + @test SubString("abcd", 0, 1, Val(:noshift)) == "a" + @test SubString("abcd", 0, 4, Val(:noshift)) == "abcd" + end end From fe0db7d9474781ee949c7927f806214c7fc00a9a Mon Sep 17 00:00:00 2001 From: Jian Fang <131804380+JianFangAtRai@users.noreply.github.com> Date: Sat, 30 Dec 2023 06:46:53 -0800 Subject: [PATCH 12/13] heap snapshot: add gc roots and gc finalist roots to fix unrooted nodes (#52618) --- src/gc-heap-snapshot.cpp | 124 +++++++++++++++++++++++++++++++-------- src/gc-heap-snapshot.h | 33 +++++++++-- 
src/gc.c | 33 +++++++++-- src/gc.h | 2 +- 4 files changed, 157 insertions(+), 35 deletions(-) diff --git a/src/gc-heap-snapshot.cpp b/src/gc-heap-snapshot.cpp index 1875a22f64ff6..24df141d053f4 100644 --- a/src/gc-heap-snapshot.cpp +++ b/src/gc-heap-snapshot.cpp @@ -3,6 +3,7 @@ #include "gc-heap-snapshot.h" #include "julia_internal.h" +#include "julia_assert.h" #include "gc.h" #include "llvm/ADT/SmallVector.h" @@ -12,8 +13,11 @@ #include #include #include +#include +#include using std::string; +using std::set; using std::ostringstream; using std::pair; using std::make_pair; @@ -116,6 +120,8 @@ struct HeapSnapshot { DenseMap node_ptr_to_index_map; size_t num_edges = 0; // For metadata, updated as you add each edge. Needed because edges owned by nodes. + size_t _gc_root_idx = 1; // node index of the GC roots node + size_t _gc_finlist_root_idx = 2; // node index of the GC finlist roots node }; // global heap snapshot, mutated by garbage collector @@ -128,13 +134,13 @@ void serialize_heap_snapshot(ios_t *stream, HeapSnapshot &snapshot, char all_one static inline void _record_gc_edge(const char *edge_type, jl_value_t *a, jl_value_t *b, size_t name_or_index) JL_NOTSAFEPOINT; void _record_gc_just_edge(const char *edge_type, Node &from_node, size_t to_idx, size_t name_or_idx) JL_NOTSAFEPOINT; -void _add_internal_root(HeapSnapshot *snapshot); +void _add_synthetic_root_entries(HeapSnapshot *snapshot); JL_DLLEXPORT void jl_gc_take_heap_snapshot(ios_t *stream, char all_one) { HeapSnapshot snapshot; - _add_internal_root(&snapshot); + _add_synthetic_root_entries(&snapshot); jl_mutex_lock(&heapsnapshot_lock); @@ -156,10 +162,12 @@ JL_DLLEXPORT void jl_gc_take_heap_snapshot(ios_t *stream, char all_one) serialize_heap_snapshot((ios_t*)stream, snapshot, all_one); } -// adds a node at id 0 which is the "uber root": -// a synthetic node which points to all the GC roots. 
-void _add_internal_root(HeapSnapshot *snapshot) +// mimicking https://github.com/nodejs/node/blob/5fd7a72e1c4fbaf37d3723c4c81dce35c149dc84/deps/v8/src/profiler/heap-snapshot-generator.cc#L212 +// add synthetic nodes for the uber root, the GC roots, and the GC finalizer list roots +void _add_synthetic_root_entries(HeapSnapshot *snapshot) { + // adds a node at id 0 which is the "uber root": + // a synthetic node which points to all the GC roots. Node internal_root{ snapshot->node_types.find_or_create_string_id("synthetic"), snapshot->names.find_or_create_string_id(""), // name @@ -170,6 +178,44 @@ void _add_internal_root(HeapSnapshot *snapshot) SmallVector() // outgoing edges }; snapshot->nodes.push_back(internal_root); + + // Add a node for the GC roots + snapshot->_gc_root_idx = snapshot->nodes.size(); + Node gc_roots{ + snapshot->node_types.find_or_create_string_id("synthetic"), + snapshot->names.find_or_create_string_id("GC roots"), // name + snapshot->_gc_root_idx, // id + 0, // size + 0, // size_t trace_node_id (unused) + 0, // int detachedness; // 0 - unknown, 1 - attached; 2 - detached + SmallVector() // outgoing edges + }; + snapshot->nodes.push_back(gc_roots); + snapshot->nodes.front().edges.push_back(Edge{ + snapshot->edge_types.find_or_create_string_id("internal"), + snapshot->names.find_or_create_string_id("GC roots"), // edge label + snapshot->_gc_root_idx // to + }); + snapshot->num_edges += 1; + + // add a node for the gc finalizer list roots + snapshot->_gc_finlist_root_idx = snapshot->nodes.size(); + Node gc_finlist_roots{ + snapshot->node_types.find_or_create_string_id("synthetic"), + snapshot->names.find_or_create_string_id("GC finalizer list roots"), // name + snapshot->_gc_finlist_root_idx, // id + 0, // size + 0, // size_t trace_node_id (unused) + 0, // int detachedness; // 0 - unknown, 1 - attached; 2 - detached + SmallVector() // outgoing edges + }; + snapshot->nodes.push_back(gc_finlist_roots); + 
snapshot->nodes.front().edges.push_back(Edge{ + snapshot->edge_types.find_or_create_string_id("internal"), + snapshot->names.find_or_create_string_id("GC finlist roots"), // edge label + snapshot->_gc_finlist_root_idx // to + }); + snapshot->num_edges += 1; } // mimicking https://github.com/nodejs/node/blob/5fd7a72e1c4fbaf37d3723c4c81dce35c149dc84/deps/v8/src/profiler/heap-snapshot-generator.cc#L597-L597 @@ -327,6 +373,26 @@ void _gc_heap_snapshot_record_root(jl_value_t *root, char *name) JL_NOTSAFEPOINT _record_gc_just_edge("internal", internal_root, to_node_idx, edge_label); } +void _gc_heap_snapshot_record_gc_roots(jl_value_t *root, char *name) JL_NOTSAFEPOINT +{ + record_node_to_gc_snapshot(root); + + auto from_node_idx = g_snapshot->_gc_root_idx; + auto to_node_idx = record_node_to_gc_snapshot(root); + auto edge_label = g_snapshot->names.find_or_create_string_id(name); + _record_gc_just_edge("internal", g_snapshot->nodes[from_node_idx], to_node_idx, edge_label); +} + +void _gc_heap_snapshot_record_finlist(jl_value_t *obj, size_t index) JL_NOTSAFEPOINT +{ + auto from_node_idx = g_snapshot->_gc_finlist_root_idx; + auto to_node_idx = record_node_to_gc_snapshot(obj); + ostringstream ss; + ss << "finlist-" << index; + auto edge_label = g_snapshot->names.find_or_create_string_id(ss.str()); + _record_gc_just_edge("internal", g_snapshot->nodes[from_node_idx], to_node_idx, edge_label); +} + // Add a node to the heap snapshot representing a Julia stack frame. // Each task points at a stack frame, which points at the stack frame of // the function it's currently calling, forming a linked list. 
@@ -393,27 +459,19 @@ void _gc_heap_snapshot_record_object_edge(jl_value_t *from, jl_value_t *to, void g_snapshot->names.find_or_create_string_id(path)); } -void _gc_heap_snapshot_record_module_to_binding(jl_module_t *module, jl_binding_t *binding) JL_NOTSAFEPOINT +void _gc_heap_snapshot_record_module_to_binding(jl_module_t *module, jl_value_t *bindings, jl_value_t *bindingkeyset) JL_NOTSAFEPOINT { - jl_globalref_t *globalref = binding->globalref; - jl_sym_t *name = globalref->name; auto from_node_idx = record_node_to_gc_snapshot((jl_value_t*)module); - auto to_node_idx = record_pointer_to_gc_snapshot(binding, sizeof(jl_binding_t), jl_symbol_name(name)); - - jl_value_t *value = jl_atomic_load_relaxed(&binding->value); - auto value_idx = value ? record_node_to_gc_snapshot(value) : 0; - jl_value_t *ty = jl_atomic_load_relaxed(&binding->ty); - auto ty_idx = ty ? record_node_to_gc_snapshot(ty) : 0; - auto globalref_idx = record_node_to_gc_snapshot((jl_value_t*)globalref); - + auto to_bindings_idx = record_node_to_gc_snapshot(bindings); + auto to_bindingkeyset_idx = record_node_to_gc_snapshot(bindingkeyset); auto &from_node = g_snapshot->nodes[from_node_idx]; - auto &to_node = g_snapshot->nodes[to_node_idx]; - - _record_gc_just_edge("property", from_node, to_node_idx, g_snapshot->names.find_or_create_string_id("")); - if (value_idx) _record_gc_just_edge("internal", to_node, value_idx, g_snapshot->names.find_or_create_string_id("value")); - if (ty_idx) _record_gc_just_edge("internal", to_node, ty_idx, g_snapshot->names.find_or_create_string_id("ty")); - if (globalref_idx) _record_gc_just_edge("internal", to_node, globalref_idx, g_snapshot->names.find_or_create_string_id("globalref")); -} + if (to_bindings_idx > 0) { + _record_gc_just_edge("internal", from_node, to_bindings_idx, g_snapshot->names.find_or_create_string_id("bindings")); + } + if (to_bindingkeyset_idx > 0) { + _record_gc_just_edge("internal", from_node, to_bindingkeyset_idx, 
g_snapshot->names.find_or_create_string_id("bindingkeyset")); + } + } void _gc_heap_snapshot_record_internal_array_edge(jl_value_t *from, jl_value_t *to) JL_NOTSAFEPOINT { @@ -492,6 +550,8 @@ void serialize_heap_snapshot(ios_t *stream, HeapSnapshot &snapshot, char all_one ios_printf(stream, "\"nodes\":["); bool first_node = true; + // use a set to track the nodes that do not have parents + set orphans; for (const auto &from_node : snapshot.nodes) { if (first_node) { first_node = false; @@ -508,6 +568,14 @@ void serialize_heap_snapshot(ios_t *stream, HeapSnapshot &snapshot, char all_one from_node.edges.size(), from_node.trace_node_id, from_node.detachedness); + if (from_node.id != snapshot._gc_root_idx && from_node.id != snapshot._gc_finlist_root_idx) { + // find the node index from the node object pointer + void * ptr = (void*)from_node.id; + size_t n_id = snapshot.node_ptr_to_index_map[ptr]; + orphans.insert(n_id); + } else { + orphans.insert(from_node.id); + } } ios_printf(stream, "],\n"); @@ -525,6 +593,12 @@ void serialize_heap_snapshot(ios_t *stream, HeapSnapshot &snapshot, char all_one edge.type, edge.name_or_index, edge.to_node * k_node_number_of_fields); + auto n_id = edge.to_node; + auto it = orphans.find(n_id); + if (it != orphans.end()) { + // remove the node from the orphans if it has at least one incoming edge + orphans.erase(it); + } } } ios_printf(stream, "],\n"); // end "edges" @@ -534,4 +608,8 @@ void serialize_heap_snapshot(ios_t *stream, HeapSnapshot &snapshot, char all_one snapshot.names.print_json_array(stream, true); ios_printf(stream, "}"); + + // remove the uber node from the orphans + orphans.erase(0); + assert(orphans.size() == 0 && "all nodes except the uber node should have at least one incoming edge"); } diff --git a/src/gc-heap-snapshot.h b/src/gc-heap-snapshot.h index 8c3af5b86bec7..1799063825e83 100644 --- a/src/gc-heap-snapshot.h +++ b/src/gc-heap-snapshot.h @@ -20,7 +20,7 @@ void 
_gc_heap_snapshot_record_task_to_frame_edge(jl_task_t *from, void *to) JL_N void _gc_heap_snapshot_record_frame_to_frame_edge(jl_gcframe_t *from, jl_gcframe_t *to) JL_NOTSAFEPOINT; void _gc_heap_snapshot_record_array_edge(jl_value_t *from, jl_value_t *to, size_t index) JL_NOTSAFEPOINT; void _gc_heap_snapshot_record_object_edge(jl_value_t *from, jl_value_t *to, void* slot) JL_NOTSAFEPOINT; -void _gc_heap_snapshot_record_module_to_binding(jl_module_t* module, jl_binding_t* binding) JL_NOTSAFEPOINT; +void _gc_heap_snapshot_record_module_to_binding(jl_module_t* module, jl_value_t *bindings, jl_value_t *bindingkeyset) JL_NOTSAFEPOINT; // Used for objects managed by GC, but which aren't exposed in the julia object, so have no // field or index. i.e. they're not reachable from julia code, but we _will_ hit them in // the GC mark phase (so we can check their type tag to get the size). @@ -28,7 +28,10 @@ void _gc_heap_snapshot_record_internal_array_edge(jl_value_t *from, jl_value_t * // Used for objects manually allocated in C (outside julia GC), to still tell the heap snapshot about the // size of the object, even though we're never going to mark that object. 
void _gc_heap_snapshot_record_hidden_edge(jl_value_t *from, void* to, size_t bytes, uint16_t alloc_type) JL_NOTSAFEPOINT; - +// Used for objects that are reachable from the GC roots +void _gc_heap_snapshot_record_gc_roots(jl_value_t *root, char *name) JL_NOTSAFEPOINT; +// Used for objects that are reachable from the finalizer list +void _gc_heap_snapshot_record_finlist(jl_value_t *finlist, size_t index) JL_NOTSAFEPOINT; extern int gc_heap_snapshot_enabled; extern int prev_sweep_full; @@ -60,6 +63,12 @@ static inline void gc_heap_snapshot_record_root(jl_value_t *root, char *name) JL _gc_heap_snapshot_record_root(root, name); } } +static inline void gc_heap_snapshot_record_array_edge_index(jl_value_t *from, jl_value_t *to, size_t index) JL_NOTSAFEPOINT +{ + if (__unlikely(gc_heap_snapshot_enabled && prev_sweep_full && from != NULL && to != NULL)) { + _gc_heap_snapshot_record_array_edge(from, to, index); + } +} static inline void gc_heap_snapshot_record_array_edge(jl_value_t *from, jl_value_t **to) JL_NOTSAFEPOINT { if (__unlikely(gc_heap_snapshot_enabled && prev_sweep_full)) { @@ -73,10 +82,10 @@ static inline void gc_heap_snapshot_record_object_edge(jl_value_t *from, jl_valu } } -static inline void gc_heap_snapshot_record_module_to_binding(jl_module_t* module, jl_binding_t* binding) JL_NOTSAFEPOINT +static inline void gc_heap_snapshot_record_module_to_binding(jl_module_t* module, jl_value_t *bindings, jl_value_t *bindingkeyset) JL_NOTSAFEPOINT { - if (__unlikely(gc_heap_snapshot_enabled && prev_sweep_full)) { - _gc_heap_snapshot_record_module_to_binding(module, binding); + if (__unlikely(gc_heap_snapshot_enabled && prev_sweep_full) && bindings != NULL && bindingkeyset != NULL) { + _gc_heap_snapshot_record_module_to_binding(module, bindings, bindingkeyset); } } @@ -94,6 +103,20 @@ static inline void gc_heap_snapshot_record_hidden_edge(jl_value_t *from, void* t } } +static inline void gc_heap_snapshot_record_gc_roots(jl_value_t *root, char *name) JL_NOTSAFEPOINT +{ + 
if (__unlikely(gc_heap_snapshot_enabled && prev_sweep_full && root != NULL)) { + _gc_heap_snapshot_record_gc_roots(root, name); + } +} + +static inline void gc_heap_snapshot_record_finlist(jl_value_t *finlist, size_t index) JL_NOTSAFEPOINT +{ + if (__unlikely(gc_heap_snapshot_enabled && prev_sweep_full && finlist != NULL)) { + _gc_heap_snapshot_record_finlist(finlist, index); + } +} + // --------------------------------------------------------------------- // Functions to call from Julia to take heap snapshot // --------------------------------------------------------------------- diff --git a/src/gc.c b/src/gc.c index 0c756d3d2880b..57bd81baf4827 100644 --- a/src/gc.c +++ b/src/gc.c @@ -2358,9 +2358,10 @@ STATIC_INLINE void gc_mark_chunk(jl_ptls_t ptls, jl_gc_markqueue_t *mq, jl_gc_ch break; } case GC_finlist_chunk: { + jl_value_t *fl_parent = c->parent; jl_value_t **fl_begin = c->begin; jl_value_t **fl_end = c->end; - gc_mark_finlist_(mq, fl_begin, fl_end); + gc_mark_finlist_(mq, fl_parent, fl_begin, fl_end); break; } default: { @@ -2458,6 +2459,7 @@ STATIC_INLINE void gc_mark_module_binding(jl_ptls_t ptls, jl_module_t *mb_parent jl_value_t *bindingkeyset = (jl_value_t *)jl_atomic_load_relaxed(&mb_parent->bindingkeyset); gc_assert_parent_validity((jl_value_t *)mb_parent, bindingkeyset); gc_try_claim_and_push(mq, bindingkeyset, &nptr); + gc_heap_snapshot_record_module_to_binding(mb_parent, bindings, bindingkeyset); gc_assert_parent_validity((jl_value_t *)mb_parent, (jl_value_t *)mb_parent->parent); gc_try_claim_and_push(mq, (jl_value_t *)mb_parent->parent, &nptr); size_t nusings = mb_parent->usings.len; @@ -2476,7 +2478,7 @@ STATIC_INLINE void gc_mark_module_binding(jl_ptls_t ptls, jl_module_t *mb_parent } } -void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) +void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t *fl_parent, jl_value_t **fl_begin, jl_value_t **fl_end) { jl_value_t *new_obj; // Decide whether need to chunk 
finlist @@ -2486,8 +2488,10 @@ void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t * gc_chunkqueue_push(mq, &c); fl_end = fl_begin + GC_CHUNK_BATCH_SIZE; } + size_t i = 0; for (; fl_begin < fl_end; fl_begin++) { - new_obj = *fl_begin; + jl_value_t **slot = fl_begin; + new_obj = *slot; if (__unlikely(new_obj == NULL)) continue; if (gc_ptr_tag(new_obj, 1)) { @@ -2498,6 +2502,13 @@ void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t * if (gc_ptr_tag(new_obj, 2)) continue; gc_try_claim_and_push(mq, new_obj, NULL); + if (fl_parent != NULL) { + gc_heap_snapshot_record_array_edge(fl_parent, slot); + } else { + // This is a list of objects following the same format as a finlist + // if `fl_parent` is NULL + gc_heap_snapshot_record_finlist(new_obj, ++i); + } } } @@ -2509,7 +2520,7 @@ void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) return; jl_value_t **fl_begin = (jl_value_t **)list->items + start; jl_value_t **fl_end = (jl_value_t **)list->items + len; - gc_mark_finlist_(mq, fl_begin, fl_end); + gc_mark_finlist_(mq, NULL, fl_begin, fl_end); } JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) @@ -3158,28 +3169,38 @@ static void gc_mark_roots(jl_gc_markqueue_t *mq) { // modules gc_try_claim_and_push(mq, jl_main_module, NULL); - gc_heap_snapshot_record_root((jl_value_t*)jl_main_module, "main_module"); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_main_module, "main_module"); // invisible builtin values gc_try_claim_and_push(mq, jl_an_empty_vec_any, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_an_empty_vec_any, "an_empty_vec_any"); gc_try_claim_and_push(mq, jl_module_init_order, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_module_init_order, "module_init_order"); for (size_t i = 0; i < jl_current_modules.size; i += 2) { if (jl_current_modules.table[i + 1] != HT_NOTFOUND) { gc_try_claim_and_push(mq, jl_current_modules.table[i], NULL); - 
gc_heap_snapshot_record_root((jl_value_t*)jl_current_modules.table[i], "top level module"); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_current_modules.table[i], "top level module"); } } gc_try_claim_and_push(mq, jl_anytuple_type_type, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_anytuple_type_type, "anytuple_type_type"); for (size_t i = 0; i < N_CALL_CACHE; i++) { jl_typemap_entry_t *v = jl_atomic_load_relaxed(&call_cache[i]); gc_try_claim_and_push(mq, v, NULL); + gc_heap_snapshot_record_array_edge_index((jl_value_t*)jl_anytuple_type_type, (jl_value_t*)v, i); } gc_try_claim_and_push(mq, jl_all_methods, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_all_methods, "all_methods"); gc_try_claim_and_push(mq, _jl_debug_method_invalidation, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)_jl_debug_method_invalidation, "debug_method_invalidation"); // constants gc_try_claim_and_push(mq, jl_emptytuple_type, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_emptytuple_type, "emptytuple_type"); gc_try_claim_and_push(mq, cmpswap_names, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)cmpswap_names, "cmpswap_names"); gc_try_claim_and_push(mq, jl_global_roots_list, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_global_roots_list, "global_roots_list"); gc_try_claim_and_push(mq, jl_global_roots_keyset, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_global_roots_keyset, "global_roots_keyset"); } // find unmarked objects that need to be finalized from the finalizer list "list". 
diff --git a/src/gc.h b/src/gc.h index 217b8050e40ac..9de67f6e7c679 100644 --- a/src/gc.h +++ b/src/gc.h @@ -468,7 +468,7 @@ extern uv_sem_t gc_sweep_assists_needed; extern _Atomic(int) gc_n_threads_marking; extern _Atomic(int) gc_n_threads_sweeping; void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); -void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT; +void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t *fl_parent, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT; void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_NOTSAFEPOINT; void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq); void gc_mark_loop_serial(jl_ptls_t ptls); From 89cae45ea4f75ce81fff08ca6e731e72e838f4ad Mon Sep 17 00:00:00 2001 From: Jishnu Bhattacharya Date: Sat, 30 Dec 2023 22:38:45 +0530 Subject: [PATCH 13/13] Optimized arithmetic methods for strided triangular matrices (#52571) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This uses broadcasting for operations like `A::UpperTriangular + B::UpperTriangular` in case the parents are `StridedMatrix`es. Looping only over the triangular part is usually faster for large matrices, where presumably memory is the bottleneck. 
Some performance comparisons, using ```julia julia> U = UpperTriangular(rand(1000,1000)); julia> U1 = UnitUpperTriangular(rand(size(U)...)); ``` | Operation | master | PR | | --------------- | ---------- | ----- | |`-U` |`1.011 ms (3 allocations: 7.63 MiB)` |`559.680 μs (3 allocations: 7.63 MiB)` | |`U + U`/`U - U` |`971.740 μs (3 allocations: 7.63 MiB)` | `560.063 μs (3 allocations: 7.63 MiB)` | |`U + U1`/`U - U1` |`3.014 ms (9 allocations: 22.89 MiB)` | `944.772 μs (3 allocations: 7.63 MiB)` | |`U1 + U1` |`4.509 ms (12 allocations: 30.52 MiB)` | `1.687 ms (3 allocations: 7.63 MiB)` | |`U1 - U1` |`3.357 ms (9 allocations: 22.89 MiB)` | `1.763 ms (3 allocations: 7.63 MiB)` | I've retained the existing methods as fallback, in case there's current code that works without broadcasting. --- stdlib/LinearAlgebra/src/triangular.jl | 67 +++++++++++++++++++------ stdlib/LinearAlgebra/test/triangular.jl | 54 ++++++++++++++++++++ 2 files changed, 107 insertions(+), 14 deletions(-) diff --git a/stdlib/LinearAlgebra/src/triangular.jl b/stdlib/LinearAlgebra/src/triangular.jl index b18b4f0159a67..64e1b320f19dd 100644 --- a/stdlib/LinearAlgebra/src/triangular.jl +++ b/stdlib/LinearAlgebra/src/triangular.jl @@ -48,6 +48,7 @@ for t in (:LowerTriangular, :UnitLowerTriangular, :UpperTriangular, :UnitUpperTr real(A::$t{<:Real}) = A real(A::$t{<:Complex}) = (B = real(A.data); $t(B)) + real(A::$t{<:Complex, <:StridedMaybeAdjOrTransMat}) = $t(real.(A)) end end @@ -156,8 +157,26 @@ const UpperOrLowerTriangular{T,S} = Union{UpperOrUnitUpperTriangular{T,S}, Lower imag(A::UpperTriangular) = UpperTriangular(imag(A.data)) imag(A::LowerTriangular) = LowerTriangular(imag(A.data)) -imag(A::UnitLowerTriangular) = LowerTriangular(tril!(imag(A.data),-1)) -imag(A::UnitUpperTriangular) = UpperTriangular(triu!(imag(A.data),1)) +imag(A::UpperTriangular{<:Any,<:StridedMaybeAdjOrTransMat}) = imag.(A) +imag(A::LowerTriangular{<:Any,<:StridedMaybeAdjOrTransMat}) = imag.(A) +function 
imag(A::UnitLowerTriangular) + L = LowerTriangular(A.data) + Lim = similar(L) # must be mutable to set diagonals to zero + Lim .= imag.(L) + for i in 1:size(Lim,1) + Lim[i,i] = zero(Lim[i,i]) + end + return Lim +end +function imag(A::UnitUpperTriangular) + U = UpperTriangular(A.data) + Uim = similar(U) # must be mutable to set diagonals to zero + Uim .= imag.(U) + for i in 1:size(Uim,1) + Uim[i,i] = zero(Uim[i,i]) + end + return Uim +end Array(A::AbstractTriangular) = Matrix(A) parent(A::UpperOrLowerTriangular) = A.data @@ -481,6 +500,11 @@ function -(A::UnitUpperTriangular) UpperTriangular(Anew) end +# use broadcasting if the parents are strided, where we loop only over the triangular part +for TM in (:LowerTriangular, :UpperTriangular) + @eval -(A::$TM{<:Any, <:StridedMaybeAdjOrTransMat}) = broadcast(-, A) +end + tr(A::LowerTriangular) = tr(A.data) tr(A::UnitLowerTriangular) = size(A, 1) * oneunit(eltype(A)) tr(A::UpperTriangular) = tr(A.data) @@ -719,6 +743,16 @@ fillstored!(A::UnitUpperTriangular, x) = (fillband!(A.data, x, 1, size(A,2)-1); -(A::UnitLowerTriangular, B::UnitLowerTriangular) = LowerTriangular(tril(A.data, -1) - tril(B.data, -1)) -(A::AbstractTriangular, B::AbstractTriangular) = copyto!(similar(parent(A)), A) - copyto!(similar(parent(B)), B) +# use broadcasting if the parents are strided, where we loop only over the triangular part +for op in (:+, :-) + for TM1 in (:LowerTriangular, :UnitLowerTriangular), TM2 in (:LowerTriangular, :UnitLowerTriangular) + @eval $op(A::$TM1{<:Any, <:StridedMaybeAdjOrTransMat}, B::$TM2{<:Any, <:StridedMaybeAdjOrTransMat}) = broadcast($op, A, B) + end + for TM1 in (:UpperTriangular, :UnitUpperTriangular), TM2 in (:UpperTriangular, :UnitUpperTriangular) + @eval $op(A::$TM1{<:Any, <:StridedMaybeAdjOrTransMat}, B::$TM2{<:Any, <:StridedMaybeAdjOrTransMat}) = broadcast($op, A, B) + end +end + ###################### # BlasFloat routines # ###################### @@ -918,47 +952,52 @@ end for (t, unitt) in ((UpperTriangular, 
UnitUpperTriangular), (LowerTriangular, UnitLowerTriangular)) + tstrided = t{<:Any, <:StridedMaybeAdjOrTransMat} @eval begin (*)(A::$t, x::Number) = $t(A.data*x) + (*)(A::$tstrided, x::Number) = A .* x function (*)(A::$unitt, x::Number) - B = A.data*x + B = $t(A.data)*x for i = 1:size(A, 1) - B[i,i] = x + B.data[i,i] = x end - $t(B) + return B end (*)(x::Number, A::$t) = $t(x*A.data) + (*)(x::Number, A::$tstrided) = x .* A function (*)(x::Number, A::$unitt) - B = x*A.data + B = x*$t(A.data) for i = 1:size(A, 1) - B[i,i] = x + B.data[i,i] = x end - $t(B) + return B end (/)(A::$t, x::Number) = $t(A.data/x) + (/)(A::$tstrided, x::Number) = A ./ x function (/)(A::$unitt, x::Number) - B = A.data/x + B = $t(A.data)/x invx = inv(x) for i = 1:size(A, 1) - B[i,i] = invx + B.data[i,i] = invx end - $t(B) + return B end (\)(x::Number, A::$t) = $t(x\A.data) + (\)(x::Number, A::$tstrided) = x .\ A function (\)(x::Number, A::$unitt) - B = x\A.data + B = x\$t(A.data) invx = inv(x) for i = 1:size(A, 1) - B[i,i] = invx + B.data[i,i] = invx end - $t(B) + return B end end end diff --git a/stdlib/LinearAlgebra/test/triangular.jl b/stdlib/LinearAlgebra/test/triangular.jl index 74e1028bf109d..b60efba1d941a 100644 --- a/stdlib/LinearAlgebra/test/triangular.jl +++ b/stdlib/LinearAlgebra/test/triangular.jl @@ -526,6 +526,23 @@ for elty1 in (Float32, Float64, BigFloat, ComplexF32, ComplexF64, Complex{BigFlo end end +@testset "non-strided arithmetic" begin + for (T,T1) in ((UpperTriangular, UnitUpperTriangular), (LowerTriangular, UnitLowerTriangular)) + U = T(reshape(1:16, 4, 4)) + M = Matrix(U) + @test -U == -M + U1 = T1(reshape(1:16, 4, 4)) + M1 = Matrix(U1) + @test -U1 == -M1 + for op in (+, -) + for (A, MA) in ((U, M), (U1, M1)), (B, MB) in ((U, M), (U1, M1)) + @test op(A, B) == op(MA, MB) + end + end + @test imag(U) == zero(U) + end +end + # Matrix square root Atn = UpperTriangular([-1 1 2; 0 -2 2; 0 0 -3]) Atp = UpperTriangular([1 1 2; 0 2 2; 0 0 3]) @@ -894,6 +911,11 @@ end U = UT(F) 
@test -U == -Array(U) end + + F = FillArrays.Fill(3im, (4,4)) + for U in (UnitUpperTriangular(F), UnitLowerTriangular(F)) + @test imag(F) == imag(collect(F)) + end end @testset "error paths" begin @@ -911,4 +933,36 @@ end end end +@testset "arithmetic with partly uninitialized matrices" begin + @testset "$(typeof(A))" for A in (Matrix{BigFloat}(undef,2,2), Matrix{Complex{BigFloat}}(undef,2,2)') + A[1,1] = A[2,2] = A[2,1] = 4 + B = Matrix{eltype(A)}(undef, size(A)) + for MT in (LowerTriangular, UnitLowerTriangular) + L = MT(A) + B .= 0 + copyto!(B, L) + @test L * 2 == 2 * L == 2B + @test L/2 == B/2 + @test 2\L == 2\B + @test real(L) == real(B) + @test imag(L) == imag(B) + end + end + + @testset "$(typeof(A))" for A in (Matrix{BigFloat}(undef,2,2), Matrix{Complex{BigFloat}}(undef,2,2)') + A[1,1] = A[2,2] = A[1,2] = 4 + B = Matrix{eltype(A)}(undef, size(A)) + for MT in (UpperTriangular, UnitUpperTriangular) + U = MT(A) + B .= 0 + copyto!(B, U) + @test U * 2 == 2 * U == 2B + @test U/2 == B/2 + @test 2\U == 2\B + @test real(U) == real(B) + @test imag(U) == imag(B) + end + end +end + end # module TestTriangular