From e7e8b8900d471f38f90d7150c5123c503aa1ac36 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Wed, 27 Dec 2023 15:59:53 -0300 Subject: [PATCH 01/13] update --gcthreads section in command line options (#52645) Make these consistent with what's shown by `julia --help`. Fixes https://github.com/JuliaLang/www.julialang.org/issues/1997. --- doc/man/julia.1 | 6 +++--- doc/src/manual/command-line-interface.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/man/julia.1 b/doc/man/julia.1 index b0af07539cb41..ee0f973c259fd 100644 --- a/doc/man/julia.1 +++ b/doc/man/julia.1 @@ -123,9 +123,9 @@ process affinity is not configured, it uses the number of CPU threads. .TP ---gcthreads -Enable n GC threads; If unspecified is set to half of the -compute worker threads. +--gcthreads=N[,M] +Use N threads for the mark phase of GC and M (0 or 1) threads for the concurrent sweeping phase of GC. +N is set to half of the number of compute threads and M is set to 0 if unspecified. .TP -p, --procs {N|auto} diff --git a/doc/src/manual/command-line-interface.md b/doc/src/manual/command-line-interface.md index c69d221a80f2e..be442e456c2e0 100644 --- a/doc/src/manual/command-line-interface.md +++ b/doc/src/manual/command-line-interface.md @@ -174,7 +174,7 @@ The following is a complete list of command-line switches available when launchi |`-E`, `--print ` |Evaluate `` and display the result| |`-L`, `--load ` |Load `` immediately on all processors| |`-t`, `--threads {N\|auto}` |Enable N threads; `auto` tries to infer a useful default number of threads to use but the exact behavior might change in the future. Currently, `auto` uses the number of CPUs assigned to this julia process based on the OS-specific affinity assignment interface, if supported (Linux and Windows). 
If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads.| -| `--gcthreads {N}` |Enable N GC threads; If unspecified is set to half of the compute worker threads.| +| `--gcthreads=N[,M]` |Use N threads for the mark phase of GC and M (0 or 1) threads for the concurrent sweeping phase of GC. N is set to half of the number of compute threads and M is set to 0 if unspecified.| |`-p`, `--procs {N\|auto}` |Integer value N launches N additional local worker processes; `auto` launches as many workers as the number of local CPU threads (logical cores)| |`--machine-file ` |Run processes on hosts listed in ``| |`-i` |Interactive mode; REPL runs and `isinteractive()` is true| From e6e572eb2c316734a7db08e87a060b137d0fe410 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Thu, 28 Dec 2023 07:50:02 +0600 Subject: [PATCH 02/13] Temporarily remove failing sorting tests (#52643) Tracked by issue #52642 --- test/sorting.jl | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/test/sorting.jl b/test/sorting.jl index 621c82ae7e7fe..35635d8588f93 100644 --- a/test/sorting.jl +++ b/test/sorting.jl @@ -1061,14 +1061,15 @@ end # it would be too much trouble to actually construct a valid pathological input, so we # construct an invalid pathological input. # This test is kind of sketchy because it passes invalid inputs to the function - for i in [1:6, 1:483, 1:957, 77:86, 118:478, 223:227, 231:970, 317:958, 500:501, 500:501, 500:501, 614:620, 632:635, 658:665, 933:940, 937:942, 997:1000, 999:1000] - x = rand(1:5, 1000) - @test partialsort(x, i, lt=(<=)) == sort(x)[i] - end - for i in [1, 7, 8, 490, 495, 852, 993, 996, 1000] - x = rand(1:5, 1000) - @test partialsort(x, i, lt=(<=)) == sort(x)[i] - end + # Temporarily removed due to flakey test failures. See #52642 for details. 
+ # for i in [1:6, 1:483, 1:957, 77:86, 118:478, 223:227, 231:970, 317:958, 500:501, 500:501, 500:501, 614:620, 632:635, 658:665, 933:940, 937:942, 997:1000, 999:1000] + # x = rand(1:5, 1000) + # @test partialsort(x, i, lt=(<=)) == sort(x)[i] + # end + # for i in [1, 7, 8, 490, 495, 852, 993, 996, 1000] + # x = rand(1:5, 1000) + # @test partialsort(x, i, lt=(<=)) == sort(x)[i] + # end end # This testset is at the end of the file because it is slow. From 26d04607c14615631e56973cd30b4fd04d3f4af8 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Thu, 28 Dec 2023 02:52:18 -0500 Subject: [PATCH 03/13] Show more info on why package precompilation was needed (#52619) --- base/loading.jl | 53 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/base/loading.jl b/base/loading.jl index 81778d41893c0..1519375099634 100644 --- a/base/loading.jl +++ b/base/loading.jl @@ -1633,11 +1633,11 @@ end # returns `nothing` if require found a precompile cache for this sourcepath, but couldn't load it # returns the set of modules restored if the cache load succeeded -@constprop :none function _require_search_from_serialized(pkg::PkgId, sourcepath::String, build_id::UInt128) +@constprop :none function _require_search_from_serialized(pkg::PkgId, sourcepath::String, build_id::UInt128; reasons=nothing) assert_havelock(require_lock) paths = find_all_in_cache_path(pkg) for path_to_try in paths::Vector{String} - staledeps = stale_cachefile(pkg, build_id, sourcepath, path_to_try) + staledeps = stale_cachefile(pkg, build_id, sourcepath, path_to_try; reasons) if staledeps === true continue end @@ -2126,11 +2126,11 @@ function _require(pkg::PkgId, env=nothing) set_pkgorigin_version_path(pkg, path) pkg_precompile_attempted = false # being safe to avoid getting stuck in a Pkg.precompile loop - + reasons = Dict{String,Int}() # attempt to load the module file via the precompile cache locations if JLOptions().use_compiled_modules != 0 @label 
load_from_cache - m = _require_search_from_serialized(pkg, path, UInt128(0)) + m = _require_search_from_serialized(pkg, path, UInt128(0); reasons) if m isa Module return m end @@ -2166,7 +2166,7 @@ function _require(pkg::PkgId, env=nothing) # double-check now that we have lock m = _require_search_from_serialized(pkg, path, UInt128(0)) m isa Module && return m - compilecache(pkg, path) + compilecache(pkg, path; reasons) end cachefile_or_module isa Module && return cachefile_or_module::Module cachefile = cachefile_or_module @@ -2568,17 +2568,17 @@ This can be used to reduce package load times. Cache files are stored in `DEPOT_PATH[1]/compiled`. See [Module initialization and precompilation](@ref) for important notes. """ -function compilecache(pkg::PkgId, internal_stderr::IO = stderr, internal_stdout::IO = stdout) +function compilecache(pkg::PkgId, internal_stderr::IO = stderr, internal_stdout::IO = stdout; reasons::Union{Dict{String,Int},Nothing}=nothing) @nospecialize internal_stderr internal_stdout path = locate_package(pkg) path === nothing && throw(ArgumentError("$pkg not found during precompilation")) - return compilecache(pkg, path, internal_stderr, internal_stdout) + return compilecache(pkg, path, internal_stderr, internal_stdout; reasons) end const MAX_NUM_PRECOMPILE_FILES = Ref(10) function compilecache(pkg::PkgId, path::String, internal_stderr::IO = stderr, internal_stdout::IO = stdout, - keep_loaded_modules::Bool = true) + keep_loaded_modules::Bool = true; reasons::Union{Dict{String,Int},Nothing}=nothing) @nospecialize internal_stderr internal_stdout # decide where to put the resulting cache file @@ -2595,7 +2595,7 @@ function compilecache(pkg::PkgId, path::String, internal_stderr::IO = stderr, in end # run the expression and cache the result verbosity = isinteractive() ? 
CoreLogging.Info : CoreLogging.Debug - @logmsg verbosity "Precompiling $pkg" + @logmsg verbosity "Precompiling $pkg $(list_reasons(reasons))" # create a temporary file in `cachepath` directory, write the cache in it, # write the checksum, _and then_ atomically move the file to `cachefile`. @@ -3311,17 +3311,29 @@ function maybe_cachefile_lock(f, pkg::PkgId, srcpath::String; stale_age=compilec f() end end + +function record_reason(reasons::Dict{String,Int}, reason::String) + reasons[reason] = get(reasons, reason, 0) + 1 +end +record_reason(::Nothing, ::String) = nothing +function list_reasons(reasons::Dict{String,Int}) + isempty(reasons) && return "" + return "(cache misses: $(join(("$k ($v)" for (k,v) in reasons), ", ")))" +end + # returns true if it "cachefile.ji" is stale relative to "modpath.jl" and build_id for modkey # otherwise returns the list of dependencies to also check -@constprop :none function stale_cachefile(modpath::String, cachefile::String; ignore_loaded::Bool = false) +@constprop :none function stale_cachefile(modpath::String, cachefile::String; ignore_loaded::Bool = false, reasons=nothing) return stale_cachefile(PkgId(""), UInt128(0), modpath, cachefile; ignore_loaded) end -@constprop :none function stale_cachefile(modkey::PkgId, build_id::UInt128, modpath::String, cachefile::String; ignore_loaded::Bool = false) +@constprop :none function stale_cachefile(modkey::PkgId, build_id::UInt128, modpath::String, cachefile::String; + ignore_loaded::Bool = false, reasons::Union{Dict{String,Int},Nothing}=nothing) io = open(cachefile, "r") try checksum = isvalid_cache_header(io) if iszero(checksum) @debug "Rejecting cache file $cachefile due to it containing an invalid cache header" + record_reason(reasons, "invalid header") return true # invalid cache file end modules, (includes, _, requires), required_modules, srctextpos, prefs, prefs_hash, clone_targets, flags = parse_cache_header(io, cachefile) @@ -3334,6 +3346,7 @@ end current session: $(CacheFlags()) 
cache file: $(CacheFlags(flags)) """ + record_reason(reasons, "mismatched flags") return true end pkgimage = !isempty(clone_targets) @@ -3342,6 +3355,7 @@ end if JLOptions().use_pkgimages == 0 # presence of clone_targets means native code cache @debug "Rejecting cache file $cachefile for $modkey since it would require usage of pkgimage" + record_reason(reasons, "requires pkgimages") return true end rejection_reasons = check_clone_targets(clone_targets) @@ -3350,10 +3364,12 @@ end Reasons=rejection_reasons, var"Image Targets"=parse_image_targets(clone_targets), var"Current Targets"=current_image_targets()) + record_reason(reasons, "target mismatch") return true end if !isfile(ocachefile) @debug "Rejecting cache file $cachefile for $modkey since pkgimage $ocachefile was not found" + record_reason(reasons, "missing ocachefile") return true end else @@ -3362,12 +3378,14 @@ end id = first(modules) if id.first != modkey && modkey != PkgId("") @debug "Rejecting cache file $cachefile for $modkey since it is for $id instead" + record_reason(reasons, "for different pkgid") return true end if build_id != UInt128(0) id_build = (UInt128(checksum) << 64) | id.second if id_build != build_id @debug "Ignoring cache file $cachefile for $modkey ($((UUID(id_build)))) since it does not provide desired build_id ($((UUID(build_id))))" + record_reason(reasons, "for different buildid") return true end end @@ -3389,6 +3407,7 @@ end @goto locate_branch else @debug "Rejecting cache file $cachefile because module $req_key is already loaded and incompatible." + record_reason(reasons, req_key == PkgId(Core) ? "wrong julia version" : "wrong dep version loaded") return true # Won't be able to fulfill dependency end else @@ -3396,6 +3415,7 @@ end path = locate_package(req_key) if path === nothing @debug "Rejecting cache file $cachefile because dependency $req_key not found." 
+ record_reason(reasons, "dep missing source") return true # Won't be able to fulfill dependency end depmods[i] = (path, req_key, req_build_id) @@ -3415,6 +3435,7 @@ end break end @debug "Rejecting cache file $cachefile because it provides the wrong build_id (got $((UUID(build_id)))) for $req_key (want $(UUID(req_build_id)))" + record_reason(reasons, "wrong dep buildid") return true # cachefile doesn't provide the required version of the dependency end end @@ -3423,6 +3444,7 @@ end if !skip_check if !samefile(includes[1].filename, modpath) && !samefile(fixup_stdlib_path(includes[1].filename), modpath) @debug "Rejecting cache file $cachefile because it is for file $(includes[1].filename) not file $modpath" + record_reason(reasons, "wrong source") return true # cache file was compiled from a different path end for (modkey, req_modkey) in requires @@ -3430,6 +3452,7 @@ end pkg = identify_package(modkey, req_modkey.name) if pkg != req_modkey @debug "Rejecting cache file $cachefile because uuid mapping for $modkey => $req_modkey has changed, expected $modkey => $pkg" + record_reason(reasons, "dep uuid changed") return true end end @@ -3437,6 +3460,7 @@ end f, fsize_req, hash_req, ftime_req = chi.filename, chi.fsize, chi.hash, chi.mtime if startswith(f, "@depot/") @debug("Rejecting stale cache file $cachefile because its depot could not be resolved") + record_reason(reasons, "nonresolveable depot") return true end if !ispath(f) @@ -3445,6 +3469,7 @@ end continue end @debug "Rejecting stale cache file $cachefile because file $f does not exist" + record_reason(reasons, "missing sourcefile") return true end if ftime_req >= 0.0 @@ -3458,17 +3483,20 @@ end !( 0 < (ftime_req - ftime) < 1e-6 ) # PR #45552: Compensate for Windows tar giving mtimes that may be incorrect by up to one microsecond if is_stale @debug "Rejecting stale cache file $cachefile because mtime of include_dependency $f has changed (mtime $ftime, before $ftime_req)" + record_reason(reasons, "include_dependency 
mtime change") return true end else fsize = filesize(f) if fsize != fsize_req @debug "Rejecting stale cache file $cachefile because file size of $f has changed (file size $fsize, before $fsize_req)" + record_reason(reasons, "include_dependency fsize change") return true end hash = open(_crc32c, f, "r") if hash != hash_req @debug "Rejecting stale cache file $cachefile because hash of $f has changed (hash $hash, before $hash_req)" + record_reason(reasons, "include_dependency fhash change") return true end end @@ -3477,12 +3505,14 @@ end if !isvalid_file_crc(io) @debug "Rejecting cache file $cachefile because it has an invalid checksum" + record_reason(reasons, "invalid checksum") return true end if pkgimage if !isvalid_pkgimage_crc(io, ocachefile::String) @debug "Rejecting cache file $cachefile because $ocachefile has an invalid checksum" + record_reason(reasons, "ocachefile invalid checksum") return true end end @@ -3490,6 +3520,7 @@ end curr_prefs_hash = get_preferences_hash(id.uuid, prefs) if prefs_hash != curr_prefs_hash @debug "Rejecting cache file $cachefile because preferences hash does not match 0x$(string(prefs_hash, base=16)) != 0x$(string(curr_prefs_hash, base=16))" + record_reason(reasons, "preferences hash mismatch") return true end From e96c13aa5b2e2b41e618ac009d43e1fdea0a70d4 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Thu, 28 Dec 2023 07:42:28 -0500 Subject: [PATCH 04/13] update nthreads info in versioninfo (#52423) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes https://github.com/JuliaLang/julia/issues/52404 @nilshg I opted to make it one line as it fits. 
``` julia> versioninfo() Julia Version 1.11.0-DEV.1011 Commit bb7091c6f2* (2023-12-04 14:58 UTC) Platform Info: OS: macOS (arm64-apple-darwin23.0.0) CPU: 10 × Apple M2 Pro WORD_SIZE: 64 LLVM: libLLVM-15.0.7 (ORCJIT, apple-m1) Threads: 1 default, 0 interactive, 1 GC (on 6 virtual cores) Environment: JULIA_EDITOR = code ``` --- stdlib/InteractiveUtils/src/InteractiveUtils.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stdlib/InteractiveUtils/src/InteractiveUtils.jl b/stdlib/InteractiveUtils/src/InteractiveUtils.jl index 629cf88cd8ce6..9927af1a4db9a 100644 --- a/stdlib/InteractiveUtils/src/InteractiveUtils.jl +++ b/stdlib/InteractiveUtils/src/InteractiveUtils.jl @@ -160,7 +160,8 @@ function versioninfo(io::IO=stdout; verbose::Bool=false) end println(io, " WORD_SIZE: ", Sys.WORD_SIZE) println(io, " LLVM: libLLVM-",Base.libllvm_version," (", Sys.JIT, ", ", Sys.CPU_NAME, ")") - println(io, " Threads: ", Threads.maxthreadid(), " on ", Sys.CPU_THREADS, " virtual cores") + println(io, """Threads: $(Threads.nthreads(:default)) default, $(Threads.nthreads(:interactive)) interactive, \ + $(Threads.ngcthreads()) GC (on $(Sys.CPU_THREADS) virtual cores)""") function is_nonverbose_env(k::String) return occursin(r"^JULIA_|^DYLD_|^LD_", k) From 90ae544fb57acdebd5b6ecd592ebd68c3ee629ed Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 28 Dec 2023 14:18:07 -0500 Subject: [PATCH 05/13] fix typo in NEWS (#52652) Typo from #52461. --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 958c51e3f840f..d5ef405b7a913 100644 --- a/NEWS.md +++ b/NEWS.md @@ -84,9 +84,9 @@ New library features write the output to a stream rather than returning a string ([#48625]). * `sizehint!(s, n)` now supports an optional `shrink` argument to disable shrinking ([#51929]). * New function `Docs.hasdoc(module, symbol)` tells whether a name has a docstring ([#52139]). 
-* Passing an IOBuffer as a stdout argument for Process spawn now works as +* Passing an `IOBuffer` as a stdout argument for `Process` spawn now works as expected, synchronized with `wait` or `success`, so a `Base.BufferStream` is - no longer required there for correctness to avoid data-races ([#TBD]). + no longer required there for correctness to avoid data races ([#52461]). * After a process exits, `closewrite` will no longer be automatically called on the stream passed to it. Call `wait` on the process instead to ensure the content is fully written, then call `closewrite` manually to avoid From ad3769e86d8657ea5cbb566c8826f7c0dbf10696 Mon Sep 17 00:00:00 2001 From: Neven Sajko Date: Thu, 28 Dec 2023 22:50:59 +0100 Subject: [PATCH 06/13] `Base`: make `Tuple(::Pair)` type-stable (#52650) Fixes #52636 --- base/tuple.jl | 1 + test/tuple.jl | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/base/tuple.jl b/base/tuple.jl index 2e1c5972c407d..19b88234cbb00 100644 --- a/base/tuple.jl +++ b/base/tuple.jl @@ -417,6 +417,7 @@ _totuple(::Type{Tuple}, itr, s...) 
= (collect(Iterators.rest(itr,s...))...,) _totuple(::Type{Tuple}, itr::Array) = (itr...,) _totuple(::Type{Tuple}, itr::SimpleVector) = (itr...,) _totuple(::Type{Tuple}, itr::NamedTuple) = (itr...,) +_totuple(::Type{Tuple}, p::Pair) = (p.first, p.second) _totuple(::Type{Tuple}, x::Number) = (x,) # to make Tuple(x) inferable end diff --git a/test/tuple.jl b/test/tuple.jl index b806667fd9d0a..3a8fb51051737 100644 --- a/test/tuple.jl +++ b/test/tuple.jl @@ -807,3 +807,9 @@ namedtup = (;a=1, b=2, c=3) @test Val{Tuple{Vararg{T,4}} where T} === Val{Tuple{T,T,T,T} where T} @test Val{Tuple{Int64, Vararg{Int32,N}} where N} === Val{Tuple{Int64, Vararg{Int32}}} @test Val{Tuple{Int32, Vararg{Int64}}} === Val{Tuple{Int32, Vararg{Int64,N}} where N} + +@testset "from Pair, issue #52636" begin + pair = (1 => "2") + @test (1, "2") == @inferred Tuple(pair) + @test (1, "2") == @inferred Tuple{Int,String}(pair) +end From e8f89682d7b434f1159626a213756b3691f48d03 Mon Sep 17 00:00:00 2001 From: Bhuminjay Soni <76656712+11happy@users.noreply.github.com> Date: Fri, 29 Dec 2023 11:03:19 +0530 Subject: [PATCH 07/13] Added Tests for Permute function in combinatorics.jl (#52648) Signed-off-by: happy Co-authored-by: happy --- test/combinatorics.jl | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/combinatorics.jl b/test/combinatorics.jl index 25a444b70ec36..862e3bfa37e1e 100644 --- a/test/combinatorics.jl +++ b/test/combinatorics.jl @@ -129,3 +129,24 @@ end end end end + +@testset "permute!" 
begin + #simple array + @test permute!([1,2,3,4,5],[3,2,1,5,4]) == [3,2,1,5,4] + #empty array + @test permute!([],[]) == [] + #single-element array + @test permute!([5],[1]) == [5] + #repeated elements in array + @test permute!([1,2,2,3,3,3],[2,1,3,5,4,6]) == [2,1,2,3,3,3] + #permutation vector contains zero + @test_throws BoundsError permute!([1,2,3],[0,1,2]) + #permutation vector contains negative indices + @test_throws BoundsError permute!([1,2,3],[2,-1,1]) + #permutation vector contains indices larger than array size + @test_throws BoundsError permute!([1,2,3],[2,4,1]) + #permutation vector is empty + @test_throws DimensionMismatch permute!([1,2,3],[]) + #array is empty + @test_throws BoundsError permute!([],[2,1]) +end From 2b2f5344138f2b50ee91132592733743f33c5358 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Fri, 29 Dec 2023 20:37:46 -0300 Subject: [PATCH 08/13] minor fix in malloc terminology used in docs (#52665) --- doc/src/manual/performance-tips.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/src/manual/performance-tips.md b/doc/src/manual/performance-tips.md index 68ee5132f8592..579413e808f7b 100644 --- a/doc/src/manual/performance-tips.md +++ b/doc/src/manual/performance-tips.md @@ -94,8 +94,8 @@ a vector of 64-bit floats so there should be no need to allocate (heap) memory. We should clarify that what `@time` reports is specifically *heap* allocations, which are typically needed for either mutable objects or for creating/growing variable-sized containers (such as `Array` or `Dict`, strings, or "type-unstable" -objects whose type is only known at runtime). Allocating (or deallocating) such blocks of memory may require an expensive -system call (e.g. via `malloc` in C), and they must be tracked for garbage collection. In contrast, immutable values like +objects whose type is only known at runtime). 
Allocating (or deallocating) such blocks of memory may require an expensive function +call to libc (e.g. via `malloc` in C), and they must be tracked for garbage collection. In contrast, immutable values like numbers (except bignums), tuples, and immutable `struct`s can be stored much more cheaply, e.g. in stack or CPU-register memory, so one doesn’t typically worry about the performance cost of "allocating" them. From 3f4cfc6d19bfe684223c3f27771d1839eafe6aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Sat, 30 Dec 2023 10:04:07 +0100 Subject: [PATCH 09/13] [dSFMT_jll] Upgrade to v2.2.5 (#52667) Usual memo to self: * update version number in `stdlib/dSFMT_jll/Project.toml` * refresh checksums with `make -f contrib/refresh_checksums.mk -j dsfmt` * update version number in `deps/checksums/dsfmt` --- deps/checksums/dsfmt | 68 +++++++++++++++++------------------ deps/dsfmt.version | 4 ++- stdlib/dSFMT_jll/Project.toml | 2 +- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/deps/checksums/dsfmt b/deps/checksums/dsfmt index 63c7e26f0eb43..0666e51efa994 100644 --- a/deps/checksums/dsfmt +++ b/deps/checksums/dsfmt @@ -1,34 +1,34 @@ -dSFMT.v2.2.4+4.aarch64-apple-darwin.tar.gz/md5/43b52709b7794c92931286174854c886 -dSFMT.v2.2.4+4.aarch64-apple-darwin.tar.gz/sha512/018b67a06cdf42dda2a906025e8a12e026af9b39fe8281890dc90d66a422c3af2a8430d42677f79d123fd0ab0e8d5c37db2e0a00ef03731d35cbb65f9e59b108 -dSFMT.v2.2.4+4.aarch64-linux-gnu.tar.gz/md5/260e14855dbc7773a2ca906d58cc57f2 -dSFMT.v2.2.4+4.aarch64-linux-gnu.tar.gz/sha512/820ca4c6afde931e855b74015150f4ffbb513276c3fa7dbcc1ec8d34c02d4989fb7424a6e4f81f93d054811b5f54f8633d955b05acdb088387ee90f1c3b00915 -dSFMT.v2.2.4+4.aarch64-linux-musl.tar.gz/md5/7ddccbad6b5c9de4be187fe76637a0d8 -dSFMT.v2.2.4+4.aarch64-linux-musl.tar.gz/sha512/e3c225da00927096e3a6cd4abc681fba8f469cb74828e7054d4f5684d71dcb8e75c9a81f14fa10bfbb78f62f9567a31a92edcca8d797e5810a2a44a3fc17bc84 
-dSFMT.v2.2.4+4.armv6l-linux-gnueabihf.tar.gz/md5/a70329e0a6c57009c6b6950fd34089f6 -dSFMT.v2.2.4+4.armv6l-linux-gnueabihf.tar.gz/sha512/4418c42165660adc050e872ef834f920c89ed6a0d2b816821672b1e862e947aad7efd023289da9bf05bb2eb9ec4b9d2561c403e2d5384d5314a4ba016b1f9cfc -dSFMT.v2.2.4+4.armv6l-linux-musleabihf.tar.gz/md5/6ffc798b8a0c847fa5cb93640bd66ab3 -dSFMT.v2.2.4+4.armv6l-linux-musleabihf.tar.gz/sha512/94e5ae07d0b1420abd7290519bce6f77deae634bbb4df31e3f02416bf509e555a9b1c9d19dd77ca76a308c2b86d5c9d4718b9ef83c13167b88a8181d8ca7e73a -dSFMT.v2.2.4+4.armv7l-linux-gnueabihf.tar.gz/md5/660d95aa08580ca1716a89c4d8b1eb24 -dSFMT.v2.2.4+4.armv7l-linux-gnueabihf.tar.gz/sha512/bc757a9f805047be5375f92c10a3f3eab69345a4ec5cc997f763e66be36144a74d414ff926df8e17b9d5a2394189269c3188c55e0b7c75a72495394d65510cef -dSFMT.v2.2.4+4.armv7l-linux-musleabihf.tar.gz/md5/78c487049092fe61949d506637c713bb -dSFMT.v2.2.4+4.armv7l-linux-musleabihf.tar.gz/sha512/03ddada4478f05eab7d2971b2deaf2cba91f084d7ce66fc8219bcb3cf5c308ea13959fed95568ca80f4ce11794e197092984919265716de8f2558e2cb30d94ce -dSFMT.v2.2.4+4.i686-linux-gnu.tar.gz/md5/b0f535336cca76f1dcdacca29c6f8410 -dSFMT.v2.2.4+4.i686-linux-gnu.tar.gz/sha512/cc03a246b32875037a41a45c1004834abc7c67f90bf17e1b41cc604ee9893147b1ca3978a2e103b94c94ac617380570473de1f66bff15de8e4ee05c5a3c21059 -dSFMT.v2.2.4+4.i686-linux-musl.tar.gz/md5/a61405f72c9a3bba5718f078c68e61a5 -dSFMT.v2.2.4+4.i686-linux-musl.tar.gz/sha512/726f130bbbfd0dece4185b89a25a73f3b5b950ebfb7f86aea6e9cbcf9ae932e591d20b854de0b4985103dbf8b4b7cb3560661c5070af971cd2c1f3ec3e1ea7d2 -dSFMT.v2.2.4+4.i686-w64-mingw32.tar.gz/md5/93670f43a98f7c6045427dc9ddd89a4a -dSFMT.v2.2.4+4.i686-w64-mingw32.tar.gz/sha512/b76c2be073312ffec8c778b83d3e37b5d0c5dba770ffcc95a6ebfba516a948beb08419428192fcd5dda83357a64e90c4e3a40144688f128133400284a7363b8e -dSFMT.v2.2.4+4.powerpc64le-linux-gnu.tar.gz/md5/fd8c73961ef7c82201e6d86e8bf4324c 
-dSFMT.v2.2.4+4.powerpc64le-linux-gnu.tar.gz/sha512/1bd0ebd019cfc6f25f7ba007547c5ee297854655b93c55e90d8ead420875de5a087e38956693d5e901ff2abf667c72aa66fb34f587b82adf4b91b3d5d666b5c7 -dSFMT.v2.2.4+4.x86_64-apple-darwin.tar.gz/md5/b57ec1491ffdd40c72860b9f1869160c -dSFMT.v2.2.4+4.x86_64-apple-darwin.tar.gz/sha512/c3a192dbcd3e768712d12d3ac851f46bfa1517eca16c9a187025553076c8fb886b925e4e3f5f20531180f72b73e7eaa5281f54d5b7d4e6a4d53f4542c4bb33b6 -dSFMT.v2.2.4+4.x86_64-linux-gnu.tar.gz/md5/fa671f4ca14b171d53c8866d03f9162a -dSFMT.v2.2.4+4.x86_64-linux-gnu.tar.gz/sha512/2e242a1448da0508ea88cc1a106f1e74f8d7e7562cd82b80d86abf9a8b454653ad7612e25c30ce00c23757e8a5b7b5736253b00a52f9473af6c5d4df768138f2 -dSFMT.v2.2.4+4.x86_64-linux-musl.tar.gz/md5/c648294163882ec539ab646542c74880 -dSFMT.v2.2.4+4.x86_64-linux-musl.tar.gz/sha512/9e96a47d660854b6517364f0db40a2f4e0e3b814499a0349f7cf550b1c8d04589fca5eb4a75bf34f36d1b5d1b2277b3e9a961c887092abedd08f438e025329e7 -dSFMT.v2.2.4+4.x86_64-unknown-freebsd.tar.gz/md5/b4497d34d72ce134ce110b6185a82393 -dSFMT.v2.2.4+4.x86_64-unknown-freebsd.tar.gz/sha512/23d0fb273edbb5a08920a3683398e11d6f4df137dabcfc5f395a9175ddf14ab8999eb961ae8f4b76715a5a2dd2b77757f752abce35c1f752b800201e93aae874 -dSFMT.v2.2.4+4.x86_64-w64-mingw32.tar.gz/md5/d380963292bc54d27d39a3f94adbd5ac -dSFMT.v2.2.4+4.x86_64-w64-mingw32.tar.gz/sha512/ef2f99b17b1a36e61fb4d149d8a8fccc9e804b3b727f2426fca917c265c2d7ada4e3abaa5383e25136dec8de262c1d11970a01d7cfb513a55f1d86a23534e864 -dsfmt-2.2.4.tar.gz/md5/ed30e63552d62df48d709dde4f755660 -dsfmt-2.2.4.tar.gz/sha512/fe84e986cbf198172340adfac0436b08f087643eca3f1ceccacde146cbfd8c41e3eb0dfbb062f7ca5f462db13c386abd7c269bc0cbefc9a0ecf97a8a8870a2e4 +dSFMT.v2.2.5+0.aarch64-apple-darwin.tar.gz/md5/36284767f523bb297633d7da17a7db5a +dSFMT.v2.2.5+0.aarch64-apple-darwin.tar.gz/sha512/e6434c154db4c7187f227a550b159a8db8cfffc514323ca31112744a80a007ba5c95f2274cee30c0aa8caf1b20fb643cb814651a622b8e4bb2e5652878e504d2 
+dSFMT.v2.2.5+0.aarch64-linux-gnu.tar.gz/md5/260e14855dbc7773a2ca906d58cc57f2 +dSFMT.v2.2.5+0.aarch64-linux-gnu.tar.gz/sha512/820ca4c6afde931e855b74015150f4ffbb513276c3fa7dbcc1ec8d34c02d4989fb7424a6e4f81f93d054811b5f54f8633d955b05acdb088387ee90f1c3b00915 +dSFMT.v2.2.5+0.aarch64-linux-musl.tar.gz/md5/7ddccbad6b5c9de4be187fe76637a0d8 +dSFMT.v2.2.5+0.aarch64-linux-musl.tar.gz/sha512/e3c225da00927096e3a6cd4abc681fba8f469cb74828e7054d4f5684d71dcb8e75c9a81f14fa10bfbb78f62f9567a31a92edcca8d797e5810a2a44a3fc17bc84 +dSFMT.v2.2.5+0.armv6l-linux-gnueabihf.tar.gz/md5/a70329e0a6c57009c6b6950fd34089f6 +dSFMT.v2.2.5+0.armv6l-linux-gnueabihf.tar.gz/sha512/4418c42165660adc050e872ef834f920c89ed6a0d2b816821672b1e862e947aad7efd023289da9bf05bb2eb9ec4b9d2561c403e2d5384d5314a4ba016b1f9cfc +dSFMT.v2.2.5+0.armv6l-linux-musleabihf.tar.gz/md5/6ffc798b8a0c847fa5cb93640bd66ab3 +dSFMT.v2.2.5+0.armv6l-linux-musleabihf.tar.gz/sha512/94e5ae07d0b1420abd7290519bce6f77deae634bbb4df31e3f02416bf509e555a9b1c9d19dd77ca76a308c2b86d5c9d4718b9ef83c13167b88a8181d8ca7e73a +dSFMT.v2.2.5+0.armv7l-linux-gnueabihf.tar.gz/md5/660d95aa08580ca1716a89c4d8b1eb24 +dSFMT.v2.2.5+0.armv7l-linux-gnueabihf.tar.gz/sha512/bc757a9f805047be5375f92c10a3f3eab69345a4ec5cc997f763e66be36144a74d414ff926df8e17b9d5a2394189269c3188c55e0b7c75a72495394d65510cef +dSFMT.v2.2.5+0.armv7l-linux-musleabihf.tar.gz/md5/78c487049092fe61949d506637c713bb +dSFMT.v2.2.5+0.armv7l-linux-musleabihf.tar.gz/sha512/03ddada4478f05eab7d2971b2deaf2cba91f084d7ce66fc8219bcb3cf5c308ea13959fed95568ca80f4ce11794e197092984919265716de8f2558e2cb30d94ce +dSFMT.v2.2.5+0.i686-linux-gnu.tar.gz/md5/11463fd3981a8c143d7aed691d18d4e0 +dSFMT.v2.2.5+0.i686-linux-gnu.tar.gz/sha512/db946a4fbd8a3163b8b1c25e02bfc4a841da7d2532892a99037bd48ac98e1840691e8cc0127d9457a82667a0131e4826cb4e9d0a13f127afc62da4eb68af5a3e +dSFMT.v2.2.5+0.i686-linux-musl.tar.gz/md5/a61405f72c9a3bba5718f078c68e61a5 
+dSFMT.v2.2.5+0.i686-linux-musl.tar.gz/sha512/726f130bbbfd0dece4185b89a25a73f3b5b950ebfb7f86aea6e9cbcf9ae932e591d20b854de0b4985103dbf8b4b7cb3560661c5070af971cd2c1f3ec3e1ea7d2 +dSFMT.v2.2.5+0.i686-w64-mingw32.tar.gz/md5/3bc27ef8f26c7a26f096cf1d558d408d +dSFMT.v2.2.5+0.i686-w64-mingw32.tar.gz/sha512/ea3608d3ae3874ea57a1a08f69abe2a1638bc340db71c6fe3c4fd5637d8c54943bf16b099a46817387c1ed4cb5f3cd1c0ff19ae8a4ed85dd555555821af06374 +dSFMT.v2.2.5+0.powerpc64le-linux-gnu.tar.gz/md5/fd8c73961ef7c82201e6d86e8bf4324c +dSFMT.v2.2.5+0.powerpc64le-linux-gnu.tar.gz/sha512/1bd0ebd019cfc6f25f7ba007547c5ee297854655b93c55e90d8ead420875de5a087e38956693d5e901ff2abf667c72aa66fb34f587b82adf4b91b3d5d666b5c7 +dSFMT.v2.2.5+0.x86_64-apple-darwin.tar.gz/md5/6be9f2d3cd8d45a3fc1c3feeebcbbf00 +dSFMT.v2.2.5+0.x86_64-apple-darwin.tar.gz/sha512/5d17c2c0eedad6739b41b8a613e9e452df484136ecd11aed1f6b9f426ae1deaef9faf721810080ebc1701a88a3e7ae91b1992893598c33b342c3f876661f2f8e +dSFMT.v2.2.5+0.x86_64-linux-gnu.tar.gz/md5/fa671f4ca14b171d53c8866d03f9162a +dSFMT.v2.2.5+0.x86_64-linux-gnu.tar.gz/sha512/2e242a1448da0508ea88cc1a106f1e74f8d7e7562cd82b80d86abf9a8b454653ad7612e25c30ce00c23757e8a5b7b5736253b00a52f9473af6c5d4df768138f2 +dSFMT.v2.2.5+0.x86_64-linux-musl.tar.gz/md5/c648294163882ec539ab646542c74880 +dSFMT.v2.2.5+0.x86_64-linux-musl.tar.gz/sha512/9e96a47d660854b6517364f0db40a2f4e0e3b814499a0349f7cf550b1c8d04589fca5eb4a75bf34f36d1b5d1b2277b3e9a961c887092abedd08f438e025329e7 +dSFMT.v2.2.5+0.x86_64-unknown-freebsd.tar.gz/md5/5b53e6c5b78102f563742b4b3d888ec6 +dSFMT.v2.2.5+0.x86_64-unknown-freebsd.tar.gz/sha512/5db5902c7ec2624add768b9e2866f9aac224a31bcb4114d450c45717e2b244521b7c511c059527d557a71639cff98e190a38cd3e28db5be0b1faf0a1762cb1a5 +dSFMT.v2.2.5+0.x86_64-w64-mingw32.tar.gz/md5/386adb3b7593c222dc7a1060a1356b21 +dSFMT.v2.2.5+0.x86_64-w64-mingw32.tar.gz/sha512/fe2ab5021126807b37042e89a22ef9a869c6a0a028680df445773b2affd11c2b02148be07d53504ea3842bb38bb62fe039529688266c1cba3545a892bd4dc185 
+dsfmt-2.2.5.tar.gz/md5/d22e476b52cdee7d5b90d2f289570073 +dsfmt-2.2.5.tar.gz/sha512/951e8669350f750b8915a819e704eae0a9b9c9518b3e3b9a1905f9ca0d25cc4c2486cb479e258a4a114e9c26ceb73a6c4e9f1cc02ed19173aeb8f20189754f6b diff --git a/deps/dsfmt.version b/deps/dsfmt.version index bbb63417f46cd..d81db2d10ff09 100644 --- a/deps/dsfmt.version +++ b/deps/dsfmt.version @@ -1,5 +1,7 @@ +# -*- makefile -*- + ## jll artifact DSFMT_JLL_NAME := dSFMT ## source build -DSFMT_VER := 2.2.4 +DSFMT_VER := 2.2.5 diff --git a/stdlib/dSFMT_jll/Project.toml b/stdlib/dSFMT_jll/Project.toml index a83775f625987..0db19e602f67b 100644 --- a/stdlib/dSFMT_jll/Project.toml +++ b/stdlib/dSFMT_jll/Project.toml @@ -1,6 +1,6 @@ name = "dSFMT_jll" uuid = "05ff407c-b0c1-5878-9df8-858cc2e60c36" -version = "2.2.4+4" +version = "2.2.5+0" [deps] Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" From 9deee4619209c997077e843b7549abc292884819 Mon Sep 17 00:00:00 2001 From: djturizo <72823727+djturizo@users.noreply.github.com> Date: Sat, 30 Dec 2023 05:58:31 -0500 Subject: [PATCH 10/13] Bunch-Kaufman factorization support for generic number types and inertia computations (#51487) ### Introduction This PR adds a generic implementation of the Bunch-Kaufman factorization in native Julia code, and a generic implementation of a inertia calculation function. Right now Julia only support the Bunch-Kaufman factorization for `Float32`, `Float64` and their complex variants. This is because the factorization is handled by LAPACK, which only supports these types. To extend support to generic number types, I translated the LAPACK implementation to native Julia code, and the code performs the factorization in-place. I also included the function `inertia` that computes the number of positive, negative, and zero eigenvalues of an $n \times n$ Bunch-Kaufman factorized matrix in $\mathcal{O}(n)$ time. ### Changes - `bunchkaufman` and `bunchkaufman!` now work for any `AbstractFloat`, `Rational` and their complex variants. 
Behavior for previously supported types is not changed (LAPACK is used when possible). `bunchkaufman!` does not support `Integer` types, as in general the factorization lies in the arithmetic closure of the number type (the rationals for the integers). On the other hand, `bunchkaufman` supports `Integer` types, by making an internal conversion to `Rational{BigInt}`. - `ldiv!` for a `BunchKaufman` factorization has extended support for generic number types with type stability. - Previously, Julia extracted the diagonal factor of an $n \times n$ `BunchKaufman` object by making a copy of the matrix and then calling a LAPACK function (`dsyconvf`, `csyconvf`, etc., depending on the number type). This function also computes the triangular factor, so it runs in $\mathcal{O}(n^2)$ time. Now Julia uses a native implementation of the LAPACK function with small modifications, so it computes the diagonal factor in $\mathcal{O}(n)$ time, without making a new copy of the matrix. - Added the function `inertia` that computes the number of positive, negative and zero eigenvalues of the diagonal factor of an $n \times n$ `BunchKaufman` object, in case that the matrix is real symmetric or Hermitian. For complex symmetric matrices, `inertia` only computes the number of zero eigenvalues of the diagonal factor. `inertia` runs in $\mathcal{O}(n)$ time and only uses arithmetic and real absolute value operations. Therefore, `inertia` can be used for matrices of any generic number type, including `Rational`. In particular, for rational matrices the output of `inertia` is exact (unless a positive tolerance is specified). - Unit tests of the `BunchKaufman` library have been adapted to handle low precision number types (approximate comparisons with tolerance `sqrt(eps(Float64))` do not make sense when the floating point type is `Float16`, for example). 
The test-set now also runs on the following types: `Float16, Complex{Float16}, BigFloat, Complex{BigFloat}, Complex{Int}, BigInt, Complex{BigInt}, Rational{BigInt}, Complex{Rational{BigInt}}`. Unit tests for the `inertia` function have been added too. --- NEWS.md | 1 + stdlib/LinearAlgebra/src/bunchkaufman.jl | 1208 ++++++++++++++++++++- stdlib/LinearAlgebra/test/bunchkaufman.jl | 81 +- 3 files changed, 1263 insertions(+), 27 deletions(-) diff --git a/NEWS.md b/NEWS.md index d5ef405b7a913..82ae28a3b420d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -115,6 +115,7 @@ Standard library changes respectively, now efficiently compute the generalized eigenvalues (`eigen`: and eigenvectors) of `A` and `B`. Note: The second argument is the output of `bunchkaufman` or `lu` ([#50471]). * Structured matrices now retain either the axes of the parent (for `Symmetric`/`Hermitian`/`AbstractTriangular`/`UpperHessenberg`), or that of the principal diagonal (for banded matrices) ([#52480]). +* `bunchkaufman` and `bunchkaufman!` now work for any `AbstractFloat`, `Rational` and their complex variants. `bunchkaufman` now supports `Integer` types, by making an internal conversion to `Rational{BigInt}`. Added new function `inertia` that computes the inertia of the diagonal factor given by the `BunchKaufman` factorization object of a real symmetric or Hermitian matrix. For complex symmetric matrices, `inertia` only computes the number of zero eigenvalues of the diagonal factor ([#51487]). #### Printf diff --git a/stdlib/LinearAlgebra/src/bunchkaufman.jl b/stdlib/LinearAlgebra/src/bunchkaufman.jl index f995b3b8444c4..19f82167495f3 100644 --- a/stdlib/LinearAlgebra/src/bunchkaufman.jl +++ b/stdlib/LinearAlgebra/src/bunchkaufman.jl @@ -4,6 +4,15 @@ ## LD for BunchKaufman, UL for CholeskyDense, LU for LUDense and ## define size methods for Factorization types using it. +##----------- Type utilities for generic Bunch-Kaufman implementation ------------ +# Generic real type. 
Any real number type should able to approximate +# real numbers, and thus be closed under arithmetic operations. +# Therefore so Int, Complex{Int}, etc. are excluded. +ClosedReal = T where T <: Union{AbstractFloat, Rational} +# Similarly, we also use a closed scalar type +ClosedScalar = Union{T, Complex{T}} where T <: ClosedReal +##-------------------------------------------------------------------------------- + """ BunchKaufman <: Factorization @@ -22,10 +31,10 @@ as appropriate given `S.uplo`, and `S.p`. # Examples ```jldoctest -julia> A = [1 2; 2 3] -2×2 Matrix{Int64}: - 1 2 - 2 3 +julia> A = Float64.([1 2; 2 3]) +2×2 Matrix{Float64}: + 1.0 2.0 + 2.0 3.0 julia> S = bunchkaufman(A) # A gets wrapped internally by Symmetric(A) BunchKaufman{Float64, Matrix{Float64}, Vector{Int64}} @@ -145,10 +154,10 @@ The following functions are available for `BunchKaufman` objects: # Examples ```jldoctest -julia> A = [1 2; 2 3] -2×2 Matrix{Int64}: - 1 2 - 2 3 +julia> A = Float64.([1 2; 2 3]) +2×2 Matrix{Float64}: + 1.0 2.0 + 2.0 3.0 julia> S = bunchkaufman(A) # A gets wrapped internally by Symmetric(A) BunchKaufman{Float64, Matrix{Float64}, Vector{Int64}} @@ -237,37 +246,47 @@ function _ipiv2perm_bk(v::AbstractVector{T}, maxi::Integer, uplo::AbstractChar, return p end -function getproperty(B::BunchKaufman{T,<:StridedMatrix}, d::Symbol) where {T<:BlasFloat} +function getproperty(B::BunchKaufman{TS}, + d::Symbol) where TS <: ClosedScalar{TR} where TR <: ClosedReal n = size(B, 1) if d === :p return _ipiv2perm_bk(getfield(B, :ipiv), n, getfield(B, :uplo), B.rook) elseif d === :P - return Matrix{T}(I, n, n)[:,invperm(B.p)] + return Matrix{TS}(I, n, n)[:,invperm(B.p)] elseif d === :L || d === :U || d === :D - if getfield(B, :rook) - LUD, od = LAPACK.syconvf_rook!(getfield(B, :uplo), 'C', copy(getfield(B, :LD)), getfield(B, :ipiv)) + if d === :D + _, od, md = generic_syconv(B, false) + elseif typeof(B) <: BunchKaufman{T,<:StridedMatrix} where {T<:BlasFloat} + # We use LAPACK whenever we 
can + if getfield(B, :rook) + LUD, _ = LAPACK.syconvf_rook!(getfield(B, :uplo), 'C', + copy(getfield(B, :LD)), getfield(B, :ipiv)) + else + LUD, _ = LAPACK.syconv!(getfield(B, :uplo), copy(getfield(B, :LD)), + getfield(B, :ipiv)) + end else - LUD, od = LAPACK.syconv!(getfield(B, :uplo), copy(getfield(B, :LD)), getfield(B, :ipiv)) + LUD, _ = generic_syconv(B) end if d === :D if getfield(B, :uplo) == 'L' odl = od[1:n - 1] - return Tridiagonal(odl, diag(LUD), getfield(B, :symmetric) ? odl : conj.(odl)) + return Tridiagonal(odl, md, getfield(B, :symmetric) ? odl : conj.(odl)) else # 'U' odu = od[2:n] - return Tridiagonal(getfield(B, :symmetric) ? odu : conj.(odu), diag(LUD), odu) + return Tridiagonal(getfield(B, :symmetric) ? odu : conj.(odu), md, odu) end elseif d === :L if getfield(B, :uplo) == 'L' return UnitLowerTriangular(LUD) else - throw(ArgumentError("factorization is U*D*transpose(U) but you requested L")) + throw(ArgumentError("factorization is U*D*U' but you requested L")) end else # :U if B.uplo == 'U' return UnitUpperTriangular(LUD) else - throw(ArgumentError("factorization is L*D*transpose(L) but you requested U")) + throw(ArgumentError("factorization is L*D*L' but you requested U")) end end else @@ -411,3 +430,1158 @@ end ## reconstruct the original matrix ## TODO: understand the procedure described at ## http://www.nag.com/numeric/FL/nagdoc_fl22/pdf/F07/f07mdf.pdf + + +##-------------------------------------------------------------------------- +##------------- Start of generic Bunch-Kaufman Implementation -------------- +##-------------------------------------------------------------------------- + +export inertia + +function arg_illegal(fun_name::AbstractString, + info::Integer, + waer::AbstractChar) + if waer == 'W' + @warn " ** On entry to '$(fun_name)' parameter number " * + "$(info) had an illegal value" + else + error(" ** On entry to '$(fun_name)' parameter number " * + "$(info) had an illegal value") + end +end + + +function cabs1(z::T) where T 
<: Complex + return abs(real(z)) + abs(imag(z)) +end + + +function cabsr(z::T) where T <: Complex + return abs(real(z)) +end + + +""" +generic_adr1!(uplo, alpha, x, y, A, syhe) -> nothing + +`generic_adr1!` performs the following adjoint (symmetric or Hermitian) +rank 1 operation + +`A[1:K,1:L] = alpha*x*y' + A[1:K,1:L]` + +in-place, where `alpha` is a scalar, `x` is a K element vector, `y` +is an L element vector and `A` is an `NxM` matrix. Note that `y'` can +denote either the transpose, i.e. `transpose(y)` or the conjugate +transpose , i.e. `adjoint(y)`. + +`uplo` is a character, either `'U'`, `'L'` or `'F'`, indicating whether +the matrix is stored in the upper triangular part (`uplo=='U'`), the +lower triangular part (`uplo=='L'`), or the full storage space is used +(`uplo=='F'`). If `uplo!='F'` then only the corresponding triangular +part is updated. The values `'U'` or `'L'` can only be used when A is +square (`N==M`). + +`syhe` is a character, either `'S'` or `'H'`, indicating whether the +symmetric adjoint (`syhe=='S'`, and `y'==transpose(y)`) or the hermitian +adjoint (`syhe=='H'`, and `y'==adjoint(y)`) must be used. +""" +function generic_adr1!(uplo::AbstractChar, + alpha::ClosedScalar{TR}, + x::AbstractVector{TS}, + y::AbstractVector{TS}, + A::AbstractMatrix{TS}, + syhe::AbstractChar + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + + # Inputs must be 1-indexed; bounds may not be checked. + Base.require_one_based_indexing(x, A) + + # Check argument validity + K = length(x) + L = length(y) + N, M = size(A) + info = 0::BlasInt + if (uplo != 'U' && uplo != 'L' && uplo != 'F') || (uplo != 'F' && N != M) + info = (-1)::BlasInt + elseif K > N + info = (-3)::BlasInt + elseif L > M + info = (-4)::BlasInt + elseif syhe != 'S' && syhe != 'H' + info = (-6)::BlasInt + end + if info < 0 + arg_illegal("generic_sadr1!", -info, 'E') + end + + # Load the requested adjoining operator + adj_op = syhe == 'S' ? 
identity : conj + + # Define loop range function according to the type of storage + # TODO: can we adjust the range without anonymous functions, + # but without having to write the same code thrice? + i_range = uplo == 'F' ? _ -> (1:K) : uplo == 'U' ? j -> (1:min(j,K)) : j -> (j:K) + + # Compute rank update of A + for j in 1:L; @inbounds begin + if y[j] != 0 + temp = alpha * adj_op(y[j]) + for i in i_range(j) + A[i,j] += x[i] * temp + end + end + end; end + return +end + + +""" +generic_mvpv!(trans, alpha, A, x, beta, y) -> nothing + +`generic_mvpv!` performs the following matrix-vector operation: + +`y[1:K] = alpha*A'*x[1:L] + beta*y[1:K]` + +in-place, where `alpha` and `beta` are scalars, `x` is a vector with at +least L elements, `y` is a vector with at least K elements, and `A` is +an `NxM` matrix. `A'` can denote the transpose, i.e. `transpose(A)` or +the conjugate transpose, i.e. `adjoint(A)`, and then `M==K && N==L`. +`A'` can also denote no adjoining at all, i.e. `A'==A`, and then +`N==K && M==L`. + +`trans` is a character, either `'T'`, `'C'` or `'N'`, indicating whether +`A'=transpose(A)`, `A'=adjoint(A)` or `A'=A`, respectively. +""" +function generic_mvpv!(trans::AbstractChar, + alpha::ClosedScalar{TR}, + A::AbstractMatrix{TS}, + x::AbstractVector{TS}, + beta::ClosedScalar{TR}, + y::AbstractVector{TS}, + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + + # Inputs must be 1-indexed; bounds may not be checked. + Base.require_one_based_indexing(A, x, y) + + # Check argument validity + M, N = size(A) + K = trans == 'N' ? M : N + L = trans == 'N' ? N : M + info = 0::BlasInt + if trans != 'T' && trans != 'C' && trans != 'N' + info = (-1)::BlasInt + elseif length(y) < K + info = (-3)::BlasInt + elseif length(x) < L + info = (-4)::BlasInt + end + if info < 0 + arg_illegal("generic_sadr1!", -info, 'E') + end + + # Quick return if possible. + if K == 0 || (alpha == 0 && beta == 1); return; end + + # Start the operations. 
In this version the elements of A are + # accessed sequentially with one pass through A. + # First form y := beta*y. + @inbounds begin + if beta != 1 + if beta == 0 + # Way less allocations and way faster for BigFloat. + # For Float64 there is some (acceptable IMO) performance loss. + y[1:K] .= 0 + else + for i in 1:K; y[i] *= beta; end + end + end + if alpha == 0 || L == 0; return; end + + if trans == 'N' + # Form y := alpha*A*x + y. + for j in 1:L + # Faster than a loop + axpy!(alpha*x[j], view(A, 1:K, j), view(y, 1:K)) + end + else + # Form y := alpha*A**T*x + y or y := alpha*A**H*x + y. + noconj = (trans == 'T') + for i = 1:K + temp = 0 + if noconj + for j = 1:L + temp = temp + A[j,i]*x[j] + end + else + for j = 1:L + temp = temp + conj(A[j,i])*x[j] + end + end + y[i] += alpha*temp + end + end + end + return +end + + +""" +bk_rowcol_swap!(A, k, kp, kstep, upper, herm) -> did_swap::Bool + +Performs the row and column interchange of the Bunch-Kaufman factorization. +If `upper==true` then the rows and columns `kp` of `A[1:k,1:k]` are +interchanged with either rows and columns `k` or `k-1` of `A[1:k,1:k]`, +depending on whether `kstep==1` or `kstep==2`, respectively. If +`upper==false` then the rows and columns `kp-k+1` of `A[k:N,k:N]` are +interchanged with either rows and columns `1` or `2` of `A[k:N,k:N]`, +depending on whether `kstep==1` or `kstep==2`, respectively. `herm=true` +then it is assumed that `A` is Hermitian, and conjugation is applied to +the appropriate entries of the interchanged rows and columns. If +`herm=false` no conjugation is performed. + +This is an internal helper function for the main Bunch-Kaufman +factorization function, `generic_bunchkaufman!`. As such, validity of the +input values is not verified. +""" +function bk_rowcol_swap!( + A::AbstractMatrix{TS}, + k::Integer, + kp::Integer, + kstep::Integer, + upper::Bool, + herm::Bool + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + kk = upper ? 
k - kstep + 1 : k + kstep - 1 + if kp != kk + if kp > 1 + thisview = upper ? view(A, 1:(kp-1), :) : view(A, (kp+1):size(A,1), :) + Base.swapcols!(thisview, kp, kk) + end + thisrange = upper ? ((kp+1):(kk-1)) : ((kk+1):(kp-1)) + if !herm + # Real/complex symmetric case + for j in thisrange + A[j,kk], A[kp,j] = A[kp,j], A[j,kk] + end + A[kk,kk], A[kp,kp] = A[kp,kp], A[kk,kk] + else + # Hermitian case + for j in thisrange + A[j,kk], A[kp,j] = conj(A[kp,j]), conj(A[j,kk]) + end + A[kp,kk] = conj(A[kp,kk]) + A[kk,kk], A[kp,kp] = real(A[kp,kp]), real(A[kk,kk]) + end + if kstep == 2 + if herm + # Force diagonal entry to be purely real + A[k,k] = real(A[k,k]) + end + if upper + A[k-1,k], A[kp,k] = A[kp,k], A[k-1,k] + else + A[k+1,k], A[kp,k] = A[kp,k], A[k+1,k] + end + end + return true + else + return false + end +end + + +""" +generic_bunchkaufman!(uplo, A, syhe, rook::Bool=false) -> +LD<:AbstractMatrix, ipiv<:AbstractVector{Integer}, info::BlasInt + +Computes the Bunch-Kaufman factorization of a symmetric or Hermitian +matrix `A` of size `NxN` as `P'*U*D*U'*P` or `P'*L*D*L'*P`, depending on +which triangle is stored in `A`. Note that if `A` is complex symmetric +then `U'` and `L'` denote the unconjugated transposes, i.e. +`transpose(U)` and `transpose(L)`. The resulting `U` or `L` and D are +stored in-place in `A`, LAPACK style. `LD` is just a reference to `A` +(that is, `LD===A`). `ipiv` stores the permutation information of the +algorithm in LAPACK format. `info` indicates whether the factorization +was successful and non-singular when `info==0`, or else `info` takes a +different value. The outputs `LD`, `ipiv`, `info` follow the format of +the LAPACK functions of the Bunch-Kaufman factorization (`dsytrf`, +`csytrf`, `chetrf`, etc.), so this function can (ideally) be used +interchangeably with its LAPACK counterparts `LAPACK.sytrf!`, +`LAPACK.sytrf_rook!`, etc. 
+ +`uplo` is a character, either `'U'` or `'L'`, indicating whether the +matrix is stored in the upper triangular part (`uplo=='U'`) or in the +lower triangular part (`uplo=='L'`). + +`syhe` is a character, either `'S'` or `'H'`, indicating whether the +matrix is real/complex symmetric (`syhe=='S'`, and the symmetric +Bunch-Kaufman factorization is performed) or complex hermitian +(`syhe=='H'`, and the hermitian Bunch-Kaufman factorization is +performed). + +If `rook` is `true`, rook pivoting is used (also called bounded +Bunch-Kaufman factorization). If `rook` is `false`, rook pivoting is +not used (standard Bunch-Kaufman factorization). Rook pivoting can +require up to `~N^3/6` extra comparisons in addition to the `~N^3/3` +additions and `~N^3/3` multiplications of the standard Bunch-Kaufman +factorization. However, rook pivoting guarantees that the entries of +`U` or `L` are bounded. + +This function implements the factorization algorithm entirely in +native Julia, so it supports any number type representing real or +complex numbers. +""" +function generic_bunchkaufman!( + uplo::AbstractChar, + A::AbstractMatrix{TS}, + syhe::AbstractChar, + rook::Bool=false + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + + # Inputs must be 1-indexed; bounds may not be checked. + Base.require_one_based_indexing(A) + + # Initialize info integer as 0 + info = 0::BlasInt + # Get size of matrix + N, N2 = size(A) + # Initialize permutation vector + ipiv = Vector{BlasInt}(undef, N) + + # Check input correctness + if uplo != 'U' && uplo != 'L' + info = (-1)::BlasInt + elseif N != N2 + info = (-2)::BlasInt + elseif syhe != 'S' && syhe != 'H' + info = (-3)::BlasInt + end + if info < 0 + arg_illegal("generic_bunchkaufman!", -info, 'W') + return A, ipiv, info + end + # if rook + # error("Rook pivoting not implemented yet.") + # end + + # Initialize `alpha` for use in choosing pivot block size. 
+ # The exact value is + # (1 + sqrt(17)) / 8 ~= 0.6404 + # For rational matrices we use the small denominator approximation: + # 16/25 = 0.64 ~= (1 + sqrt(17)) / 8 + # in order to not increase the denominator size too much in computations. + # The error of this approximation is ≤0.1%, and it still guarantees that a + # 2x2 block in the D factor has a positive-negative eigenvalue pair, as long + # as the approximation lies in (0,1). + alpha = TR <: AbstractFloat ? (1 + sqrt(TR(17))) / 8 : TR(16//25) + # Use complex 1-norm for pivot selection, as in LAPACK + abs1_fun = TS <: Real ? abs : cabs1 + + # Check if the matrix is symmetric or hermitian + if syhe == 'S' || (syhe == 'H' && TS <: Real) + # Use symmetric variant if matrix is real, regardless of 'syhe' value + syhe = 'S' + diag_abs_fun = abs1_fun + else + diag_abs_fun = cabsr + end + + # Compute machine safe minimum when working with floating point numbers. + # LAPACK doesn't use this for diagonal pivoting though... + if rook + if TR <: AbstractFloat + # eps(0) gives the smallest subnormal number, and eps(1) gives the floating + # point type epsilon. eps(0)/eps(1) gives the smallest normal number, plus + # possibly some rounding error. + sfmin = nextfloat(eps(TR(0)) / eps(TR(1)), 2) + small = 1 / prevfloat(typemax(TR), 2) + if small >= sfmin + # 1/sfmin may overflow, so use 'small' plus a bit as the safe minimum + sfmin = nextfloat(small * (1 + eps(TR(1))), 2) + end + else + # We're working with rationals in this case, so all the results are exact. + sfmin = TR(0) + end + end + + # Run factorization depending on where the data is stored + upper = (uplo == 'U') + herm = (syhe == 'H') + # TODO: Is this gonna inline properly? + @inline k_cond = upper ? k -> k >= 1 : k -> k <= N + @inline irange = upper ? j -> (j:-1:1) : j -> (j:N) + @inline conj_op = herm ? conj : identity + @inline diagreal_op = herm ? (j -> A[j,j] = TS(real(A[j,j]))) : _ -> () + k = upper ? 
N : 1 + # Main loop, comments refer to the upper triangular version of the factorization. + # The lower triangular version is analogous. + while k_cond(k); @inbounds begin + kstep = 1 + knext = upper ? k - 1 : k + 1 + p = k + # Determine rows and columns to be interchanged and whether + # a 1-by-1 or 2-by-2 pivot block will be used + absakk = diag_abs_fun(A[k,k]) + # IMAX is the row-index of the largest off-diagonal element in + # column K, and COLMAX is its absolute value. + # Determine both COLMAX and IMAX. + if upper && k > 1 + colmax, imax = findmax(abs1_fun, view(A, 1:(k-1), k)) + elseif (!upper) && k < N + colmax, imax = findmax(abs1_fun, view(A, (k+1):N, k)) + imax += k + else + colmax = 0 + end + if (max(absakk, colmax) == 0) || isnan(absakk) + # Column K is zero or underflow, or contains a NaN: + # set INFO and continue + if info == 0 + info = k::BlasInt + end + kp = k + if herm + # Force diagonal entry to be purely real + A[k,k] = real(A[k,k]) + end + else + if absakk >= alpha*colmax + # no interchange, use 1-by-1 pivot block + kp = k + elseif rook + # Loop until pivot found + while true + # Begin pivot search loop body + # JMAX is the column-index of the largest off-diagonal + # element in row IMAX, and ROWMAX is its absolute value. + # Determine both ROWMAX and JMAX. + if imax != k + thisview = upper ? view(A, imax, (imax+1):k) : + view(A, imax, k:(imax-1)) + rowmax, jmax = findmax(abs1_fun, thisview) + jmax += upper ? imax : k - 1 + else + # LAPACK makes rowmax=0 in this case, but I believe it's + # better to make rowmax=-1, so that we guarantee that jmax + # will be define in the next if-block. + # TODO: is this correct/safe? + rowmax = 0 + end + if (upper && imax > 1) || ((!upper) && imax < N) + # Remember that we only have the upper triangular part + # of the matrix. We inspect the part of the row in the + # lower triangular part by traversing the corresponding + # part of the transpose column. 
+ if upper + stemp, itemp = findmax(abs1_fun, view(A, 1:(imax-1), imax)) + else + stemp, itemp = findmax(abs1_fun, view(A, (imax+1):N, imax)) + itemp += imax + end + if stemp > rowmax + rowmax = stemp + jmax = itemp + end + end + # Equivalent to testing for (used to handle NaN and Inf) + # CABS1( A( IMAX, IMAX ) ).GE.ALPHA*ROWMAX + if !(diag_abs_fun(A[imax,imax]) < alpha*rowmax) + # interchange rows and columns K and IMAX, + # use 1-by-1 pivot block + kp = imax + break + # Equivalent to testing for ROWMAX .EQ. COLMAX, + # used to handle NaN and Inf + elseif (p == jmax || rowmax <= colmax) + # interchange rows and columns K+1 and IMAX, + # use 2-by-2 pivot block + kp = imax + kstep = 2 + break + else + # Pivot NOT found, set variables and repeat + p = imax + colmax = rowmax + imax = jmax + end + # End pivot search loop body + end + else + # JMAX is the column-index of the largest off-diagonal + # element in row IMAX, and ROWMAX is its absolute value + # We don't really need JMAX, se we don't store it + thisview = upper ? view(A, imax, (imax+1):k) : view(A, imax, k:(imax-1)) + rowmax = findmax(abs1_fun, thisview)[1] + if (upper && imax > 1) || ((!upper) && imax < N) + # Remember that we only have the upper triangular part + # of the matrix. We inspect the part of the row in the + # lower triangular part by traversing the corresponding + # part of the transpose column. + thisview = upper ? 
view(A, 1:(imax-1), imax) : + view(A, (imax+1):N, imax) + rowmax = max(rowmax, findmax(abs1_fun, thisview)[1]) + end + if absakk >= alpha * colmax * (colmax/rowmax) + # no interchange, use 1-by-1 pivot block + kp = k + elseif diag_abs_fun(A[imax,imax]) >= alpha * rowmax + # interchange rows and columns K and IMAX, use 1-by-1 + # pivot block + kp = imax + else + # interchange rows and columns K-1 and IMAX, use 2-by-2 + # pivot block + kp = imax + p = imax + kstep = 2 + end + end + # Swap TWO rows and TWO columns + # First swap + # The first swap only needs to be done when using rook pivoting + if rook && kstep == 2 + # Interchange rows and column K and P in the leading + # submatrix A(1:k,1:k) if we have a 2-by-2 pivot + bk_rowcol_swap!(A, k, p, 1, upper, herm) + end + # Second swap + did_swap = bk_rowcol_swap!(A, k, kp, kstep, upper, herm) + if herm && (!did_swap) + # Force diagonal entries to be purely real + A[k,k] = real(A[k,k]) + if kstep == 2 + A[knext,knext] = real(A[knext,knext]) + end + end + if kstep == 1 + # 1-by-1 pivot block D(k): column k now holds + # W(k) = U(k)*D(k) + # where U(k) is the k-th column of U + # When rook=false, sfmin is not defined, but the short-circuit + # evaluation of the conditional avoids an error. + if (!rook) || absakk >= sfmin + # Perform a rank-1 update of A(1:k-1,1:k-1) as + # A := A - U(k)*D(k)*U(k)' = A - W(k)*1/D(k)*W(k)' + # Compute 1/D(k) + r1 = !herm ? 1 / A[k,k] : 1 / real(A[k,k]) + # Perform rank-1 update to store the Schur complement + # in a submatrix of A + x = upper ? view(A, 1:(k-1), k) : view(A, (k+1):N, k) + # if 'upper' this should assign by reference + thisview = upper ? A : view(A, (k+1):N, (k+1):N) + generic_adr1!(uplo, -r1, x, x, thisview, syhe) + # Store U(k) in column k + thisrange = upper ? (1:(k-1)) : ((k+1):N) + for i in thisrange + A[i,k] *= r1 + end + else + # Compute D(k) + r1 = !herm ? A[k,k] : real(A[k,k]) + # Store U(k) in column k + thisrange = upper ? 
(1:(k-1)) : ((k+1):N) + for i in thisrange + A[i,k] /= r1 + end + # Perform a rank-1 update of A(k+1:n,k+1:n) as + # A := A - U(k)*D(k)*U(k)**T + # = A - W(k)*(1/D(k))*W(k)**T + # = A - (W(k)/D(k))*(D(k))*(W(k)/D(K))**T + # Perform rank-1 update to store the Schur complement + # in a submatrix of A + x = upper ? view(A, 1:(k-1), k) : view(A, (k+1):N, k) + # if 'upper' this should assign by reference + thisview = upper ? A : view(A, (k+1):N, (k+1):N) + generic_adr1!(uplo, -r1, x, x, thisview, syhe) + end + elseif (upper && k > 2) || ((!upper) && k < N - 1) + # 2-by-2 pivot block D(k): columns k and k-1 now hold + # ( W(k-1) W(k) ) = ( U(k-1) U(k) )*D(k) + # where U(k) and U(k-1) are the k-th and (k-1)-th columns + # of U + # Perform a rank-2 update of A(1:k-2,1:k-2) as + # A := A - ( U(k-1) U(k) )*D(k)*( U(k-1) U(k) )' + # = A - ( W(k-1) W(k) )*inv(D(k))*( W(k-1) W(k) )' + thisrange = upper ? ((k-2):-1:1) : ((k+2):N) + if !herm + # Real/complex symmetric case + #TODO: is this way to compute the inverse backward stable? + # (it probably is as it comes from LAPACK) + dxk = A[knext,k] + dxx = A[knext,knext] / dxk + dkk = A[k,k] / dxk + t = 1 / (dkk * dxx - 1) + dxk = t / dxk + dkx = dxk + else + # Hermitian case + # TODO: is this way to compute the inverse backward stable? + # (it probably is as it is a small modification of LAPACK's + # method) + dxk = A[knext,k] + dxx = real(A[knext,knext]) / dxk + dkk = real(A[k,k]) / conj(dxk) + t = 1 / (real(dkk * dxx) - 1) + dkx = t / conj(dxk) + dxk = t / dxk + end + for j in thisrange + wknext = dxk * (dkk*A[j,knext] - A[j,k]) + wk = dkx * (dxx*A[j,k] - A[j,knext]) + for i in irange(j) + A[i,j] -= (A[i,k]*conj_op(wk) + A[i,knext]*conj_op(wknext)) + end + A[j,k] = wk + A[j,knext] = wknext + # Force diagonal entry to be purely real, but still of + # complex type TS (I don't know why in LAPACK this + # case, unlike the rest, enforces a complex type + # explicitly). 
+ diagreal_op(j) + end + end + end + # Store details of the interchanges in IPIV + if kstep == 1 + ipiv[k] = kp + else + ipiv[k] = -p + ipiv[knext] = -kp + end + # Decrease K and return to the start of the main loop + # k -= upper ? kstep : -kstep + if upper; k -= kstep; else; k += kstep; end + end; end + return A, ipiv, info +end + + +""" +generic_syconv(F, gettri::Bool=true) -> +(TLU<:Union{AbstractMatrix,Nothing}, e<:AbstractVector, + d<:Union{AbstractVector,Nothing}) + +`generic_syconv` takes the Bunch-Kaufman object `F` and returns the +block-diagonal factor `D`, and the triangular factor `L` (or `U`) if +requested. If the `L` or `U` factor is requested then both `L` (or `U`) and +the main diagonal of `D` will be stored in `TLU`, following LAPACK format, +and `d` will be set to `nothing`. `e` contains the first subdiagonal of +`D`. If the triangular factor is not requested, then `TLU` will not be set +to `nothing`, and the main diagonal of `D` will be stored in `d`. + +`gettri` is a `Bool`, indicating whether the `L` (or `U`) triangular factor +should be computed (`gettri==true`) or not (`gettri==false`). If the +triangular factor is required, a copy of `A.LD` will be created, and the +triangular factor will be computed in-place in said copy. +""" +function generic_syconv( + F::BunchKaufman{TS}, + gettri::Bool=true + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + + # Inputs must be 1-indexed; bounds may not be checked. + Base.require_one_based_indexing(F.LD, F.ipiv) + + # Extract necessary variables + A, ipiv, rook = gettri ? deepcopy(F.LD) : F.LD, F.ipiv, F.rook + + # Get size of matrix + N = size(A)[1] + + # Initialize off-diagonal and diagonal vector + e = Vector{TS}(undef, N) + d = gettri ? nothing : diag(A, 0) + + # Quick return if possible + if N == 0; return gettri ? A : nothing, e, d; end + + # Main loops + upper = (F.uplo == 'U') + @inline icond_d = upper ? i -> i > 1 : i -> i < N + @inline icond_T = upper ? 
i -> i >= 1 : i -> i <= N + @inline inext = upper ? i -> i - 1 : i -> i + 1 + # Convert VALUE + i = upper ? N : 1 + e[N+1-i] = 0 + while icond_d(i); @inbounds begin + if ipiv[i] < 0 + ix = inext(i) + e[i] = A[ix,i] + e[ix] = 0 + if gettri; A[ix,i] = 0; end + if upper; i -= 1; else; i += 1; end + else + e[i] = 0 + end + if upper; i -= 1; else; i += 1; end + end; end + # Convert PERMUTATIONS + if gettri + i = upper ? N : 1 + while icond_T(i); @inbounds begin + thisview = upper ? view(A, :, (i+1):N) : view(A, :, 1:(i-1)) + ip = ipiv[i] + if ip > 0 || rook + Base.swaprows!(thisview, abs(ip), i) + end + if ip <= 0 + ix = inext(i) + Base.swaprows!(thisview, -ipiv[ix], ix) + if upper; i -= 1; else; i += 1; end + end + if upper; i -= 1; else; i += 1; end + end; end + end + return gettri ? A : nothing, e, d +end + + +""" +generic_bksolve!(F, B) -> X<:AbstractVecOrMat + +`generic_bksolve!` solves a system of linear equations `A*X = B` where +the Bunch-Kaufman factorization of `A` is provided by `F`. +""" +function generic_bksolve!( + F::BunchKaufman{TS}, + B0::AbstractVecOrMat{TS}, + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + + # Inputs must be 1-indexed; bounds may not be checked. + Base.require_one_based_indexing(F.LD, F.ipiv, B0) + + # Get size of matrices + N = size(F.LD)[1] + if typeof(B0) <: AbstractVector + N3 = size(B0)[1] + M = 1 + B = view(B0, :, :) + else + N3, M = size(B0) + B = B0 + end + + # Initialize info integer as 0 + info = 0::BlasInt + + # Check input correctness + if N3 != N + info = (-2)::BlasInt + end + if info < 0 + arg_illegal("generic_bksolve!", -info, 'E') + end + + # Quick return if possible + if N == 0 || M == 0; return B; end + + # Extract necessary variables + A, ipiv, symm, rook = F.LD, F.ipiv, issymmetric(F), F.rook + + # Load the requested adjoining operator + adj_op = symm ? identity : conj + + R1 = TR(1) + upper = (F.uplo == 'U') + @inline kcond1 = upper ? k -> k >= 1 : k -> k <= N + @inline kcond2 = upper ? 
k -> k <= N : k -> k >= 1 + @inline knext = upper ? k -> k - 1 : k -> k + 1 + @inline knext2 = upper ? k -> k + 1 : k -> k - 1 + k = upper ? N : 1 + while kcond1(k); @inbounds begin + kp = ipiv[k] + if kp > 0 + # 1 x 1 diagonal block + # Interchange rows K and IPIV(K). + Base.swaprows!(B, k, kp) + # Multiply by inv(U(K)), where U(K) is the transformation + # stored in column K of A. + Aview = upper ? view(A, 1:(k-1), k) : view(A, (k+1):N, k) + Bview = upper ? B : view(B, (k+1):N, :) + generic_adr1!('F', -R1, Aview, view(B, k, :), Bview, 'S') + # Multiply by the inverse of the diagonal block. + s = symm ? 1 / A[k,k] : 1 / real(A[k,k]) + for j in 1:M; B[k,j] *= s; end + if upper; k -= 1; else; k += 1; end + else + # 2 x 2 diagonal block + # Interchange rows K and -IPIV(K) THEN K-1 and -IPIV(K-1) + # The first interchange is only needed when rook pivoting is used + if rook; Base.swaprows!(B, k, -kp); end + kx = knext(k) + Base.swaprows!(B, kx, -ipiv[kx]) + # Multiply by inv(U(K)), where U(K) is the transformation + # stored in columns K-1 and K of A. + Aview = upper ? view(A, 1:(k-2), k) : view(A, (k+2):N, k) + Bview = upper ? B : view(B, (k+2):N, :) + generic_adr1!('F', -R1, Aview, view(B, k, :), Bview, 'S') + Aview = upper ? view(A, 1:(k-2), kx) : view(A, (k+2):N, kx) + generic_adr1!('F', -R1, Aview, view(B, kx, :), Bview, 'S') + # Multiply by the inverse of the diagonal block. + axk = A[kx,k] + axx = A[kx,kx] / axk + akk = A[k,k] / adj_op(axk) + denom = axx*akk - 1 + for j in 1:M + bx = B[kx,j] / axk + bk = B[k,j] / adj_op(axk) + B[kx,j] = (akk*bx - bk) / denom + B[k,j] = (axx*bk - bx) / denom + end + if upper; k -= 2; else; k += 2; end + end + end; end + # Next solve U'*X = B, overwriting B with X. + # K is the main loop index, increasing from 1 to N in steps of + # 1 or 2, depending on the size of the diagonal blocks. + k = upper ? 1 : N + while kcond2(k); @inbounds begin + Aview = upper ? view(A, 1:(k-1), k) : view(A, (k+1):N, k) + Bview = upper ? 
view(B, 1:(k-1), :) : view(B, (k+1):N, :) + B_row = view(B, k, :) + kp = ipiv[k] + if kp > 0 + # 1 x 1 diagonal block + # Multiply by inv(U**T(K)), where U(K) is the transformation + # stored in column K of A. + if symm + generic_mvpv!('T', -R1, Bview, Aview, R1, B_row) + else + conj!(B_row) + generic_mvpv!('C', -R1, Bview, Aview, R1, B_row) + conj!(B_row) + end + # Interchange rows K and IPIV(K). + Base.swaprows!(B, k, kp) + if upper; k += 1; else; k -= 1; end + else + # 2 x 2 diagonal block + # Multiply by inv(U**T(K+1)), where U(K+1) is the transformation + # stored in columns K and K+1 of A. + kx = knext2(k) + if symm + generic_mvpv!('T', -R1, Bview, Aview, R1, B_row) + Aview = upper ? view(A, 1:(k-1), kx) : view(A, (k+1):N, kx) + B_row = view(B, kx, :) + generic_mvpv!('T', -R1, Bview, Aview, R1, B_row) + elseif k > 1 + conj!(B_row) + generic_mvpv!('C', -R1, Bview, Aview, R1, B_row) + conj!(B_row) + Aview = upper ? view(A, 1:(k-1), kx) : view(A, (k+1):N, kx) + B_row = view(B, kx, :) + conj!(B_row) + generic_mvpv!('C', -R1, Bview, Aview, R1, B_row) + conj!(B_row) + end + # Interchange rows K and -IPIV(K) THEN K+1 and -IPIV(K+1). + # The second interchange is only needed when rook pivoting is used + Base.swaprows!(B, k, -kp) + if rook; Base.swaprows!(B, kx, -ipiv[kx]); end + if upper; k += 2; else; k -= 2; end + end + end; end + return B +end + + +""" +inertia(B::BunchKaufman; atol::Real=0, rtol::Real=atol>0 ? 0 : n*ϵ) -> + np::Union{Nothing,Integer}, nn::Union{Nothing,Integer}, nz::Integer + +`inertia` computes the numerical inertia (the number of positive, +negative and zero eigenvalues, given by `np`, `nn` and `nz`, +respectively) of a real symmetric of Hermitian matrix `B` that has been +factored using the Bunch-Kaufman algorithm. For complex symmetric +matrices the inertia is not defined. in that case `np` and `nn` are set +to `nothing`, but the function still returns the number of zero +eigenvalues. 
The inertia is computed by counting the signs of the eigenvalues
+of `B.D`. The number of zero eigenvalues is computed as the number of
+estimated eigenvalues with complex 1-norm (defined as `|re(.)|+|im(.)|`)
+less than or equal to `max(atol, rtol*s₁)`, where `s₁` is an upper bound of
+the largest singular value of `B.D`, `σ₁` (more specifically,
+`0.5*s₁ <= σ₁ <= s₁` for real matrices and `0.35*s₁ <= σ₁ <= s₁` for
+complex matrices). `atol` and `rtol` are the absolute and relative
+tolerances, respectively. The default relative tolerance is `n*ϵ`, where
+`n` is the size of `A`, and `ϵ` is the [`eps`](@ref) of the number
+type of `A`, if this type is a subtype of `AbstractFloat`. In any other
+case (if the number type of `A` is `Rational`, for example) `ϵ` is set
+to `0`.
+
+!!! note
+    Numerical inertia can be a sensitive and imprecise characterization of
+    ill-conditioned matrices with eigenvalues that are close in magnitude to the
+    threshold tolerance `max(atol, rtol*s₁)`. In such cases, slight perturbations
+    to the Bunch-Kaufman computation or to the matrix can change the result of
+    `rank` by pushing one or more eigenvalues across the threshold. These
+    variations can even occur due to changes in floating-point errors between
+    different Julia versions, architectures, compilers, or operating systems.
+    In particular, the size of the entries of the triangular factor directly
+    influence the scale of the eigenvalues of the diagonal factor, so it is
+    strongly recommended to use rook pivoting if the inertia is going to be
+    computed.
+    On the other hand, if the matrix has rational entries, the inertia
+    computation is guaranteed to be exact, as long as there is no
+    under/overflow in the underlying integer type (and in such cases Julia itself
+    throws an error), or a positive tolerance (absolute or relative) is
+    specified. 
+""" +function inertia(B::BunchKaufman{TS}; + atol::TR = TR(0), + rtol::TR = TR(0) + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + + # Check if matrix is complex symmetric + get_inertia = !(issymmetric(B) && TS <: Complex) + + # Initialize outputs + np, nn, nz = get_inertia ? (0, 0, 0) : (nothing, nothing, 0) + + # Compute matrix size + N = size(B, 1) + + # Quick return if possible + if N == 0; return np, nn, nz; end + + # Compute default relative tolerance + if rtol <= 0 && atol <= 0 + rtol = TR <: AbstractFloat ? (N * eps(TR)) : TR(0) + end + + # We use the complex 1-norm for complex matrices + real_matrix = (TS <: Real) + abs1_fun = real_matrix ? abs : cabs1 + real_fun = real_matrix ? identity : real + + # Check if we must track the largest singular value + get_s1 = (rtol > 0) + + # Constant for lower bound estimation of the smallest eigenvalue in 2x2 blocks. + # The best (largest) value for complex matrices is 1/sqrt(2), but for rational + # matrices we use the small denominator approximation 12/17, in order to not + # increase the denominator size too much in computations. The error of this + # approximation is ≤0.2%, and we still get a valid lower bound. + c = real_matrix ? TR(1) : (TR <: AbstractFloat ? 1/sqrt(TR(2)) : TR(12//17)) + + # First pass, estimate largest singular value and group together size-1 blocks + D = B.D + s1 = TR(0) + i = 1 + while i <= N; @inbounds begin + if i < N && D[i,i+1] != 0 + # 2x2 block + # The largest singular value of a 2x2 matrix is between [1, 2] times + # its complex max-norm, which is between [c, 1] times the largest + # complex 1-norm among the entries of the 2x2 matrix. See "Roger + # Horn and Charles Johnson. Matrix Analysis, 2nd Edition, 5.6.P23". 
+ abs_Dii = abs1_fun(D[i,i]) + abs_Dxx = abs1_fun(D[i+1,i+1]) + s1_block = 2 * max(abs_Dii, abs1_fun(D[i,i+1]), abs_Dxx) + if get_s1; s1 = max(s1, s1_block); end + # Lower bound on the smallest eigenvalue complex 2-norm is + # abs(λ₂) ≥ abs(det(block)) / s1_block + # so the bound in terms of the complex 1-norm becomes + # abs1_fun(λ₂) ≥ c * abs1_fun(det(block)) / s1_block + # For rational matrices, if λ₂=0 then det(block)=0 and then the bound + # becomes zero too. If λ₁=0 too then the block has all zero entries + # and 's1_block'=0, but 'D[i,i+1]' != 0 and so 's1_block' > 0. However, we + # may still have that 'smin_block'≈0, then the value of 'smin_block' may not + # be accurate. In that case the counting routine will detect that both + # eigenvalues are zero without using 'smin_block', so it doesn't matter. + # TODO: is this the most numerically stable way to compute the determinant? + # TODO: is this the best way to avoid under/overflow? + if abs_Dii >= abs_Dxx + smin_block = c * abs1_fun((D[i,i]/s1_block)*D[i+1,i+1] - + (D[i,i+1]/s1_block)*D[i+1,i]) + else + smin_block = c * abs1_fun(D[i,i]*(D[i+1,i+1]/s1_block) - + (D[i,i+1]/s1_block)*D[i+1,i]) + end + # Store lower bound in-place in the lower off-diagonal and upper bound + # in-place in the upper off-diagonal. The trace is stored in the first + # diagonal entry block, but only if the full inertia is needed. + D[i,i+1] = s1_block + D[i+1,i] = smin_block + if get_inertia; D[i,i] += D[i+1,i+1]; end + i += 2 + else + # 1x1 block + if get_s1; s1 = max(s1, abs1_fun(D[i,i])); end + i += 1 + end + end; end + + # Second pass, count eigenvalue signs + tol = max(atol, rtol * s1) + i = 1 + while i <= N; @inbounds begin + if i < N && D[i,i+1] != 0 + # 2x2 block. For the counting of zero eigenvalues we use the lower bound on the + # eigenvalues' magnitude. This way, if an eigenvalue is deemed non-zero, then + # it is guaranteed that its magnitude is greater than the tolerance. 
+            s1_block = real_fun(D[i,i+1])
+            if (c / 2) * s1_block <= tol
+                # Lower bound of largest eigenvalue is smaller than the tolerance,
+                # we consider both eigenvalues of this block to be zero.
+                nz += 2
+                i += 2
+                continue
+            end
+            # Reaching this part of the loop implies that 's1_block' != 0.
+            smin_block = real_fun(D[i+1,i])
+            trace_block = real_fun(D[i,i])
+            if smin_block > tol || trace_block == 0
+                # If first condition holds then the lower bound of the smallest eigenvalue
+                # is larger than the tolerance. If the second condition holds then the trace
+                # is exactly zero, so both eigenvalues have the same magnitude, and we
+                # already know that the largest one is non-zero. In any case we conclude
+                # that both eigenvalues are non-zero.
+                if get_inertia
+                    # The eigenvalues of a 2x2 block are guaranteed to be a
+                    # positive-negative pair.
+                    np += 1
+                    nn += 1
+                end
+            else
+                # The lower bound of the smallest eigenvalue is smaller than the tolerance and
+                # the trace is non-zero, so we consider the smallest eigenvalue of this
+                # block to be zero.
+                nz += 1
+                if get_inertia
+                    # The trace is non-zero, and its sign is the same as that of the largest
+                    # eigenvalue.
+                    if trace_block >= 0
+                        np += 1
+                    else
+                        nn += 1
+                    end
+                end
+            end
+            i += 2
+        else
+            # 1x1 block
+            if get_inertia
+                eig = real_fun(D[i,i])
+                if eig > tol
+                    np += 1
+                elseif eig < -tol
+                    nn += 1
+                else
+                    nz += 1
+                end
+            elseif abs1_fun(D[i,i]) <= tol
+                nz += 1
+            end
+            i += 1
+        end
+    end; end
+
+    return np, nn, nz
+end
+
+
+"""
+    bunchkaufman_native!(A, rook::Bool=false; check = true) -> BunchKaufman
+
+`bunchkaufman_native!` is the same as [`bunchkaufman!`](@ref), but it performs
+the factorization in native Julia code instead of calling LAPACK. 
+""" +function bunchkaufman_native!(A::AbstractMatrix{TS}, + rook::Bool = false; + check::Bool = true, + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + if A isa RealHermSymComplexSym{TR} + syhe = 'S' + elseif ishermitian(A) + syhe = 'H' + elseif issymmetric(A) + syhe = 'S' + else + throw(ArgumentError("Bunch-Kaufman decomposition is only valid for " * + "symmetric or Hermitian matrices")) + end + if A isa HermOrSym + Adata = A.data + uplo = A.uplo + else + Adata = A + uplo = 'U' + end + LD, ipiv, info = generic_bunchkaufman!(uplo, Adata, syhe, rook) + check && checknonsingular(info) + return BunchKaufman(LD, ipiv, uplo, syhe == 'S', rook, info) +end + + +""" +Overload 'bunchkaufman.jl' methods through multiple dispatch +""" + +function bunchkaufman!(A::AbstractMatrix{TS}, + rook::Bool = false; + check::Bool = true + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + return bunchkaufman_native!(A, rook; check) +end + +function bunchkaufman(A::AbstractMatrix{TS}, + rook::Bool = false; + check::Bool = true + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + return bunchkaufman!(eigencopy_oftype(A, TS), rook; check) +end + +function bunchkaufman(A::AbstractMatrix{TS}, + rook::Bool = false; + check::Bool = true + ) where TS <:Union{TI, Complex{TI}} where TI <: Integer + + # Identity whether matrix is symmetric or Hermitian or none + if A isa Symmetric + TA = Symmetric + elseif A isa Hermitian + TA = Hermitian + else + TA = Nothing + end + + # Create a rational copy of input integer matrix, as the Bunch-Kaufman + # algorithm is closed over the rationals but not over the integers. 
+ # We promote input to BigInt to avoid overflow problems + if TA == Nothing + if TS <: Integer + M = Rational{BigInt}.(eigencopy_oftype(A, TS)) + else + M = Complex{Rational{BigInt}}.(eigencopy_oftype(A, TS)) + end + else + if TS <: Integer + M = TA(Rational{BigInt}.(eigencopy_oftype(A, TS)), Symbol(A.uplo)) + else + M = TA(Complex{Rational{BigInt}}.(eigencopy_oftype(A, TS)), + Symbol(A.uplo)) + end + end + + return bunchkaufman_native!(M, rook; check) +end + +function ldiv!(B::BunchKaufman{TS}, + R::AbstractVecOrMat{TS} + ) where TS <: ClosedScalar{TR} where TR <: ClosedReal + return generic_bksolve!(B, R) +end + +function inv(B::BunchKaufman{TS}) where TS <: ClosedScalar{TR} where TR <: ClosedReal + # I don't think there's value in implementing tha LAPACK in-place inverse + # functions `dsytri`, `chetri`, etc., unless of course an efficient + # in-place inverse function `inv!` is needed. + # TODO: reduce the operation count of the inverse by not computing the + # lower/upper triangular part. + if issymmetric(B) + return copytri!(B \ I, B.uplo) + else + return copytri!(B \ I, B.uplo, true) + end +end diff --git a/stdlib/LinearAlgebra/test/bunchkaufman.jl b/stdlib/LinearAlgebra/test/bunchkaufman.jl index 613e4d09a3cc6..d2305844db63e 100644 --- a/stdlib/LinearAlgebra/test/bunchkaufman.jl +++ b/stdlib/LinearAlgebra/test/bunchkaufman.jl @@ -21,9 +21,24 @@ a2img = randn(n,n)/2 breal = randn(n,2)/2 bimg = randn(n,2)/2 -@testset "$eltya argument A" for eltya in (Float32, Float64, ComplexF32, ComplexF64, Int) - a = eltya == Int ? rand(1:7, n, n) : convert(Matrix{eltya}, eltya <: Complex ? complex.(areal, aimg) : areal) - a2 = eltya == Int ? rand(1:7, n, n) : convert(Matrix{eltya}, eltya <: Complex ? 
complex.(a2real, a2img) : a2real) +areint = rand(1:7, n, n) +aimint = rand(1:7, n, n) +a2reint = rand(1:7, n, n) +a2imint = rand(1:7, n, n) +breint = rand(1:5, n, 2) +bimint = rand(1:5, n, 2) + +@testset "$eltya argument A" for eltya in (Float32, Float64, ComplexF32, ComplexF64, Int, ### + Float16, Complex{Float16}, BigFloat, Complex{BigFloat}, Complex{Int}, BigInt, + Complex{BigInt}, Rational{BigInt}, Complex{Rational{BigInt}}) + # a = eltya == Int ? rand(1:7, n, n) : convert(Matrix{eltya}, eltya <: Complex ? complex.(areal, aimg) : areal) + # a2 = eltya == Int ? rand(1:7, n, n) : convert(Matrix{eltya}, eltya <: Complex ? complex.(a2real, a2img) : a2real) + a = convert(Matrix{eltya}, eltya <: Complex ? (real(eltya) <: AbstractFloat ? + complex.(areal, aimg) : complex.(areint, aimint)) : (eltya <: AbstractFloat ? + areal : areint)) + a2 = convert(Matrix{eltya}, eltya <: Complex ? (real(eltya) <: AbstractFloat ? + complex.(a2real, a2img) : complex.(a2reint, a2imint)) : (eltya <: AbstractFloat ? + a2real : a2reint)) asym = transpose(a) + a # symmetric indefinite aher = a' + a # Hermitian indefinite apd = a' * a # Positive-definite @@ -34,9 +49,39 @@ bimg = randn(n,2)/2 view(apd , 1:n, 1:n))) ε = εa = eps(abs(float(one(eltya)))) + # Inertia tests + @testset "$uplo Bunch-Kaufman factor inertia" for uplo in (:L, :U) + @testset "rook pivoting: $rook" for rook in (false, true) + test_list = eltya <: Complex ? (Hermitian(aher, uplo), Hermitian(apd, uplo)) : + (Symmetric(transpose(a) + a, uplo), Hermitian(aher, uplo), + Hermitian(apd, uplo)) + ελ = n*max(eps(Float64), εa) # zero-eigenvalue threshold + ελ = typeof(Integer(one(real(eltya)))) <: Signed ? Rational{BigInt}(ελ) : + real(eltya(ελ)) + for M in test_list + bc = bunchkaufman(M, rook) + D = bc.D + λ = real(eltya <: Complex ? 
eigen(ComplexF64.(D)).values : + eigen(Float64.(D)).values) + σ₁ = norm(λ, Inf) + np = sum(λ .> ελ*σ₁) + nn = sum(λ .< -ελ*σ₁) + nz = n - np - nn + if real(eltya) <: AbstractFloat + @test inertia(bc) == (np, nn, nz) + else + @test inertia(bc; rtol=ελ) == (np, nn, nz) + end + end + end + end + # check that factorize gives a Bunch-Kaufman - @test isa(factorize(asym), LinearAlgebra.BunchKaufman) - @test isa(factorize(aher), LinearAlgebra.BunchKaufman) + if eltya <: Union{Float32, Float64, ComplexF32, ComplexF64, Int} + # Default behaviour only uses Bunch-Kaufman for these types, for now. + @test isa(factorize(asym), LinearAlgebra.BunchKaufman) + @test isa(factorize(aher), LinearAlgebra.BunchKaufman) + end @testset "$uplo Bunch-Kaufman factor of indefinite matrix" for uplo in (:L, :U) bc1 = bunchkaufman(Hermitian(aher, uplo)) @test LinearAlgebra.issuccess(bc1) @@ -89,15 +134,25 @@ bimg = randn(n,2)/2 @test Base.propertynames(bc1) == (:p, :P, :L, :U, :D) end - @testset "$eltyb argument B" for eltyb in (Float32, Float64, ComplexF32, ComplexF64, Int) - b = eltyb == Int ? rand(1:5, n, 2) : convert(Matrix{eltyb}, eltyb <: Complex ? complex.(breal, bimg) : breal) + @testset "$eltyb argument B" for eltyb in (Float32, Float64, ComplexF32, ComplexF64, Int, ### + Float16, Complex{Float16}, BigFloat, Complex{BigFloat}, Complex{Int}, BigInt, + Complex{BigInt}, Rational{BigInt}, Complex{Rational{BigInt}}) + # b = eltyb == Int ? rand(1:5, n, 2) : convert(Matrix{eltyb}, eltyb <: Complex ? complex.(breal, bimg) : breal) + b = convert(Matrix{eltyb}, eltyb <: Complex ? (real(eltyb) <: AbstractFloat ? + complex.(breal, bimg) : complex.(breint, bimint)) : (eltyb <: AbstractFloat ? + breal : breint)) for b in (b, view(b, 1:n, 1:2)) εb = eps(abs(float(one(eltyb)))) ε = max(εa,εb) + epsc = eltya <: Complex ? 
sqrt(2)*n : n # tolerance scale @testset "$uplo Bunch-Kaufman factor of indefinite matrix" for uplo in (:L, :U) bc1 = bunchkaufman(Hermitian(aher, uplo)) - @test aher*(bc1\b) ≈ b atol=1000ε + # @test aher*(bc1\b) ≈ b atol=1000ε + cda = eltya <: Complex ? cond(ComplexF64.(aher)) : cond(Float64.(aher)) + cda = real(eltya) <: AbstractFloat ? real(eltya(cda)) : cda + @test norm(aher*(bc1\b) - b) <= epsc*sqrt(eps(cda))*max( + norm(aher*(bc1\b)), norm(b)) end @testset "$uplo Bunch-Kaufman factors of a pos-def matrix" for uplo in (:U, :L) @@ -112,8 +167,14 @@ bimg = randn(n,2)/2 @test logdet(bc2) ≈ log(det(bc2)) @test logabsdet(bc2)[1] ≈ log(abs(det(bc2))) @test logabsdet(bc2)[2] == sign(det(bc2)) - @test inv(bc2)*apd ≈ Matrix(I, n, n) - @test apd*(bc2\b) ≈ b rtol=eps(cond(apd)) + # @test inv(bc2)*apd ≈ Matrix(I, n, n) rtol=Base.rtoldefault(real(eltya)) + # @test apd*(bc2\b) ≈ b rtol=eps(cond(apd)) + @test norm(inv(bc2)*apd - Matrix(I, n, n)) <= epsc*Base.rtoldefault( + real(eltya))*max(norm(inv(bc2)*apd), norm(Matrix(I, n, n))) + cda = eltya <: Complex ? cond(ComplexF64.(apd)) : cond(Float64.(apd)) + cda = real(eltya) <: AbstractFloat ? 
real(eltya(cda)) : cda + @test norm(apd*(bc2\b) - b) <= epsc*sqrt(eps(cda))*max( + norm(apd*(bc2\b)), norm(b)) @test ishermitian(bc2) @test !issymmetric(bc2) || eltya <: Real end From 2091058c5a2be7b70f3cad0cf1ce0ea7391863b0 Mon Sep 17 00:00:00 2001 From: tecosaur Date: Sat, 30 Dec 2023 20:20:28 +0800 Subject: [PATCH 11/13] Fix :noshift construction of an empty SubString (#51923) --- base/strings/substring.jl | 2 +- test/strings/basic.jl | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/base/strings/substring.jl b/base/strings/substring.jl index dfd8770b08d47..ffa78c00cb885 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -37,7 +37,7 @@ struct SubString{T<:AbstractString} <: AbstractString return new(s, i-1, nextind(s,j)-i) end function SubString{T}(s::T, i::Int, j::Int, ::Val{:noshift}) where T<:AbstractString - @boundscheck begin + @boundscheck if !(i == j == 0) si, sj = i + 1, prevind(s, j + i + 1) @inbounds isvalid(s, si) || string_index_err(s, si) @inbounds isvalid(s, sj) || string_index_err(s, sj) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 225a41bf12be9..87d812c5bf201 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -203,6 +203,12 @@ end @test (@views (x[3], x[1:2], x[[1,4]])) isa Tuple{Char, SubString, String} @test (@views (x[3], x[1:2], x[[1,4]])) == ('c', "ab", "ad") end + + @testset ":noshift constructor" begin + @test SubString("", 0, 0, Val(:noshift)) == "" + @test SubString("abcd", 0, 1, Val(:noshift)) == "a" + @test SubString("abcd", 0, 4, Val(:noshift)) == "abcd" + end end From fe0db7d9474781ee949c7927f806214c7fc00a9a Mon Sep 17 00:00:00 2001 From: Jian Fang <131804380+JianFangAtRai@users.noreply.github.com> Date: Sat, 30 Dec 2023 06:46:53 -0800 Subject: [PATCH 12/13] heap snapshot: add gc roots and gc finalist roots to fix unrooted nodes (#52618) --- src/gc-heap-snapshot.cpp | 124 +++++++++++++++++++++++++++++++-------- src/gc-heap-snapshot.h | 33 +++++++++-- 
src/gc.c | 33 +++++++++-- src/gc.h | 2 +- 4 files changed, 157 insertions(+), 35 deletions(-) diff --git a/src/gc-heap-snapshot.cpp b/src/gc-heap-snapshot.cpp index 1875a22f64ff6..24df141d053f4 100644 --- a/src/gc-heap-snapshot.cpp +++ b/src/gc-heap-snapshot.cpp @@ -3,6 +3,7 @@ #include "gc-heap-snapshot.h" #include "julia_internal.h" +#include "julia_assert.h" #include "gc.h" #include "llvm/ADT/SmallVector.h" @@ -12,8 +13,11 @@ #include #include #include +#include +#include using std::string; +using std::set; using std::ostringstream; using std::pair; using std::make_pair; @@ -116,6 +120,8 @@ struct HeapSnapshot { DenseMap node_ptr_to_index_map; size_t num_edges = 0; // For metadata, updated as you add each edge. Needed because edges owned by nodes. + size_t _gc_root_idx = 1; // node index of the GC roots node + size_t _gc_finlist_root_idx = 2; // node index of the GC finlist roots node }; // global heap snapshot, mutated by garbage collector @@ -128,13 +134,13 @@ void serialize_heap_snapshot(ios_t *stream, HeapSnapshot &snapshot, char all_one static inline void _record_gc_edge(const char *edge_type, jl_value_t *a, jl_value_t *b, size_t name_or_index) JL_NOTSAFEPOINT; void _record_gc_just_edge(const char *edge_type, Node &from_node, size_t to_idx, size_t name_or_idx) JL_NOTSAFEPOINT; -void _add_internal_root(HeapSnapshot *snapshot); +void _add_synthetic_root_entries(HeapSnapshot *snapshot); JL_DLLEXPORT void jl_gc_take_heap_snapshot(ios_t *stream, char all_one) { HeapSnapshot snapshot; - _add_internal_root(&snapshot); + _add_synthetic_root_entries(&snapshot); jl_mutex_lock(&heapsnapshot_lock); @@ -156,10 +162,12 @@ JL_DLLEXPORT void jl_gc_take_heap_snapshot(ios_t *stream, char all_one) serialize_heap_snapshot((ios_t*)stream, snapshot, all_one); } -// adds a node at id 0 which is the "uber root": -// a synthetic node which points to all the GC roots. 
-void _add_internal_root(HeapSnapshot *snapshot) +// mimicking https://github.com/nodejs/node/blob/5fd7a72e1c4fbaf37d3723c4c81dce35c149dc84/deps/v8/src/profiler/heap-snapshot-generator.cc#L212 +// add synthetic nodes for the uber root, the GC roots, and the GC finalizer list roots +void _add_synthetic_root_entries(HeapSnapshot *snapshot) { + // adds a node at id 0 which is the "uber root": + // a synthetic node which points to all the GC roots. Node internal_root{ snapshot->node_types.find_or_create_string_id("synthetic"), snapshot->names.find_or_create_string_id(""), // name @@ -170,6 +178,44 @@ void _add_internal_root(HeapSnapshot *snapshot) SmallVector() // outgoing edges }; snapshot->nodes.push_back(internal_root); + + // Add a node for the GC roots + snapshot->_gc_root_idx = snapshot->nodes.size(); + Node gc_roots{ + snapshot->node_types.find_or_create_string_id("synthetic"), + snapshot->names.find_or_create_string_id("GC roots"), // name + snapshot->_gc_root_idx, // id + 0, // size + 0, // size_t trace_node_id (unused) + 0, // int detachedness; // 0 - unknown, 1 - attached; 2 - detached + SmallVector() // outgoing edges + }; + snapshot->nodes.push_back(gc_roots); + snapshot->nodes.front().edges.push_back(Edge{ + snapshot->edge_types.find_or_create_string_id("internal"), + snapshot->names.find_or_create_string_id("GC roots"), // edge label + snapshot->_gc_root_idx // to + }); + snapshot->num_edges += 1; + + // add a node for the gc finalizer list roots + snapshot->_gc_finlist_root_idx = snapshot->nodes.size(); + Node gc_finlist_roots{ + snapshot->node_types.find_or_create_string_id("synthetic"), + snapshot->names.find_or_create_string_id("GC finalizer list roots"), // name + snapshot->_gc_finlist_root_idx, // id + 0, // size + 0, // size_t trace_node_id (unused) + 0, // int detachedness; // 0 - unknown, 1 - attached; 2 - detached + SmallVector() // outgoing edges + }; + snapshot->nodes.push_back(gc_finlist_roots); + 
snapshot->nodes.front().edges.push_back(Edge{ + snapshot->edge_types.find_or_create_string_id("internal"), + snapshot->names.find_or_create_string_id("GC finlist roots"), // edge label + snapshot->_gc_finlist_root_idx // to + }); + snapshot->num_edges += 1; } // mimicking https://github.com/nodejs/node/blob/5fd7a72e1c4fbaf37d3723c4c81dce35c149dc84/deps/v8/src/profiler/heap-snapshot-generator.cc#L597-L597 @@ -327,6 +373,26 @@ void _gc_heap_snapshot_record_root(jl_value_t *root, char *name) JL_NOTSAFEPOINT _record_gc_just_edge("internal", internal_root, to_node_idx, edge_label); } +void _gc_heap_snapshot_record_gc_roots(jl_value_t *root, char *name) JL_NOTSAFEPOINT +{ + record_node_to_gc_snapshot(root); + + auto from_node_idx = g_snapshot->_gc_root_idx; + auto to_node_idx = record_node_to_gc_snapshot(root); + auto edge_label = g_snapshot->names.find_or_create_string_id(name); + _record_gc_just_edge("internal", g_snapshot->nodes[from_node_idx], to_node_idx, edge_label); +} + +void _gc_heap_snapshot_record_finlist(jl_value_t *obj, size_t index) JL_NOTSAFEPOINT +{ + auto from_node_idx = g_snapshot->_gc_finlist_root_idx; + auto to_node_idx = record_node_to_gc_snapshot(obj); + ostringstream ss; + ss << "finlist-" << index; + auto edge_label = g_snapshot->names.find_or_create_string_id(ss.str()); + _record_gc_just_edge("internal", g_snapshot->nodes[from_node_idx], to_node_idx, edge_label); +} + // Add a node to the heap snapshot representing a Julia stack frame. // Each task points at a stack frame, which points at the stack frame of // the function it's currently calling, forming a linked list. 
@@ -393,27 +459,19 @@ void _gc_heap_snapshot_record_object_edge(jl_value_t *from, jl_value_t *to, void g_snapshot->names.find_or_create_string_id(path)); } -void _gc_heap_snapshot_record_module_to_binding(jl_module_t *module, jl_binding_t *binding) JL_NOTSAFEPOINT +void _gc_heap_snapshot_record_module_to_binding(jl_module_t *module, jl_value_t *bindings, jl_value_t *bindingkeyset) JL_NOTSAFEPOINT { - jl_globalref_t *globalref = binding->globalref; - jl_sym_t *name = globalref->name; auto from_node_idx = record_node_to_gc_snapshot((jl_value_t*)module); - auto to_node_idx = record_pointer_to_gc_snapshot(binding, sizeof(jl_binding_t), jl_symbol_name(name)); - - jl_value_t *value = jl_atomic_load_relaxed(&binding->value); - auto value_idx = value ? record_node_to_gc_snapshot(value) : 0; - jl_value_t *ty = jl_atomic_load_relaxed(&binding->ty); - auto ty_idx = ty ? record_node_to_gc_snapshot(ty) : 0; - auto globalref_idx = record_node_to_gc_snapshot((jl_value_t*)globalref); - + auto to_bindings_idx = record_node_to_gc_snapshot(bindings); + auto to_bindingkeyset_idx = record_node_to_gc_snapshot(bindingkeyset); auto &from_node = g_snapshot->nodes[from_node_idx]; - auto &to_node = g_snapshot->nodes[to_node_idx]; - - _record_gc_just_edge("property", from_node, to_node_idx, g_snapshot->names.find_or_create_string_id("")); - if (value_idx) _record_gc_just_edge("internal", to_node, value_idx, g_snapshot->names.find_or_create_string_id("value")); - if (ty_idx) _record_gc_just_edge("internal", to_node, ty_idx, g_snapshot->names.find_or_create_string_id("ty")); - if (globalref_idx) _record_gc_just_edge("internal", to_node, globalref_idx, g_snapshot->names.find_or_create_string_id("globalref")); -} + if (to_bindings_idx > 0) { + _record_gc_just_edge("internal", from_node, to_bindings_idx, g_snapshot->names.find_or_create_string_id("bindings")); + } + if (to_bindingkeyset_idx > 0) { + _record_gc_just_edge("internal", from_node, to_bindingkeyset_idx, 
g_snapshot->names.find_or_create_string_id("bindingkeyset")); + } + } void _gc_heap_snapshot_record_internal_array_edge(jl_value_t *from, jl_value_t *to) JL_NOTSAFEPOINT { @@ -492,6 +550,8 @@ void serialize_heap_snapshot(ios_t *stream, HeapSnapshot &snapshot, char all_one ios_printf(stream, "\"nodes\":["); bool first_node = true; + // use a set to track the nodes that do not have parents + set orphans; for (const auto &from_node : snapshot.nodes) { if (first_node) { first_node = false; @@ -508,6 +568,14 @@ void serialize_heap_snapshot(ios_t *stream, HeapSnapshot &snapshot, char all_one from_node.edges.size(), from_node.trace_node_id, from_node.detachedness); + if (from_node.id != snapshot._gc_root_idx && from_node.id != snapshot._gc_finlist_root_idx) { + // find the node index from the node object pointer + void * ptr = (void*)from_node.id; + size_t n_id = snapshot.node_ptr_to_index_map[ptr]; + orphans.insert(n_id); + } else { + orphans.insert(from_node.id); + } } ios_printf(stream, "],\n"); @@ -525,6 +593,12 @@ void serialize_heap_snapshot(ios_t *stream, HeapSnapshot &snapshot, char all_one edge.type, edge.name_or_index, edge.to_node * k_node_number_of_fields); + auto n_id = edge.to_node; + auto it = orphans.find(n_id); + if (it != orphans.end()) { + // remove the node from the orphans if it has at least one incoming edge + orphans.erase(it); + } } } ios_printf(stream, "],\n"); // end "edges" @@ -534,4 +608,8 @@ void serialize_heap_snapshot(ios_t *stream, HeapSnapshot &snapshot, char all_one snapshot.names.print_json_array(stream, true); ios_printf(stream, "}"); + + // remove the uber node from the orphans + orphans.erase(0); + assert(orphans.size() == 0 && "all nodes except the uber node should have at least one incoming edge"); } diff --git a/src/gc-heap-snapshot.h b/src/gc-heap-snapshot.h index 8c3af5b86bec7..1799063825e83 100644 --- a/src/gc-heap-snapshot.h +++ b/src/gc-heap-snapshot.h @@ -20,7 +20,7 @@ void 
_gc_heap_snapshot_record_task_to_frame_edge(jl_task_t *from, void *to) JL_N void _gc_heap_snapshot_record_frame_to_frame_edge(jl_gcframe_t *from, jl_gcframe_t *to) JL_NOTSAFEPOINT; void _gc_heap_snapshot_record_array_edge(jl_value_t *from, jl_value_t *to, size_t index) JL_NOTSAFEPOINT; void _gc_heap_snapshot_record_object_edge(jl_value_t *from, jl_value_t *to, void* slot) JL_NOTSAFEPOINT; -void _gc_heap_snapshot_record_module_to_binding(jl_module_t* module, jl_binding_t* binding) JL_NOTSAFEPOINT; +void _gc_heap_snapshot_record_module_to_binding(jl_module_t* module, jl_value_t *bindings, jl_value_t *bindingkeyset) JL_NOTSAFEPOINT; // Used for objects managed by GC, but which aren't exposed in the julia object, so have no // field or index. i.e. they're not reachable from julia code, but we _will_ hit them in // the GC mark phase (so we can check their type tag to get the size). @@ -28,7 +28,10 @@ void _gc_heap_snapshot_record_internal_array_edge(jl_value_t *from, jl_value_t * // Used for objects manually allocated in C (outside julia GC), to still tell the heap snapshot about the // size of the object, even though we're never going to mark that object. 
void _gc_heap_snapshot_record_hidden_edge(jl_value_t *from, void* to, size_t bytes, uint16_t alloc_type) JL_NOTSAFEPOINT; - +// Used for objects that are reachable from the GC roots +void _gc_heap_snapshot_record_gc_roots(jl_value_t *root, char *name) JL_NOTSAFEPOINT; +// Used for objects that are reachable from the finalizer list +void _gc_heap_snapshot_record_finlist(jl_value_t *finlist, size_t index) JL_NOTSAFEPOINT; extern int gc_heap_snapshot_enabled; extern int prev_sweep_full; @@ -60,6 +63,12 @@ static inline void gc_heap_snapshot_record_root(jl_value_t *root, char *name) JL _gc_heap_snapshot_record_root(root, name); } } +static inline void gc_heap_snapshot_record_array_edge_index(jl_value_t *from, jl_value_t *to, size_t index) JL_NOTSAFEPOINT +{ + if (__unlikely(gc_heap_snapshot_enabled && prev_sweep_full && from != NULL && to != NULL)) { + _gc_heap_snapshot_record_array_edge(from, to, index); + } +} static inline void gc_heap_snapshot_record_array_edge(jl_value_t *from, jl_value_t **to) JL_NOTSAFEPOINT { if (__unlikely(gc_heap_snapshot_enabled && prev_sweep_full)) { @@ -73,10 +82,10 @@ static inline void gc_heap_snapshot_record_object_edge(jl_value_t *from, jl_valu } } -static inline void gc_heap_snapshot_record_module_to_binding(jl_module_t* module, jl_binding_t* binding) JL_NOTSAFEPOINT +static inline void gc_heap_snapshot_record_module_to_binding(jl_module_t* module, jl_value_t *bindings, jl_value_t *bindingkeyset) JL_NOTSAFEPOINT { - if (__unlikely(gc_heap_snapshot_enabled && prev_sweep_full)) { - _gc_heap_snapshot_record_module_to_binding(module, binding); + if (__unlikely(gc_heap_snapshot_enabled && prev_sweep_full) && bindings != NULL && bindingkeyset != NULL) { + _gc_heap_snapshot_record_module_to_binding(module, bindings, bindingkeyset); } } @@ -94,6 +103,20 @@ static inline void gc_heap_snapshot_record_hidden_edge(jl_value_t *from, void* t } } +static inline void gc_heap_snapshot_record_gc_roots(jl_value_t *root, char *name) JL_NOTSAFEPOINT +{ + 
if (__unlikely(gc_heap_snapshot_enabled && prev_sweep_full && root != NULL)) { + _gc_heap_snapshot_record_gc_roots(root, name); + } +} + +static inline void gc_heap_snapshot_record_finlist(jl_value_t *finlist, size_t index) JL_NOTSAFEPOINT +{ + if (__unlikely(gc_heap_snapshot_enabled && prev_sweep_full && finlist != NULL)) { + _gc_heap_snapshot_record_finlist(finlist, index); + } +} + // --------------------------------------------------------------------- // Functions to call from Julia to take heap snapshot // --------------------------------------------------------------------- diff --git a/src/gc.c b/src/gc.c index 0c756d3d2880b..57bd81baf4827 100644 --- a/src/gc.c +++ b/src/gc.c @@ -2358,9 +2358,10 @@ STATIC_INLINE void gc_mark_chunk(jl_ptls_t ptls, jl_gc_markqueue_t *mq, jl_gc_ch break; } case GC_finlist_chunk: { + jl_value_t *fl_parent = c->parent; jl_value_t **fl_begin = c->begin; jl_value_t **fl_end = c->end; - gc_mark_finlist_(mq, fl_begin, fl_end); + gc_mark_finlist_(mq, fl_parent, fl_begin, fl_end); break; } default: { @@ -2458,6 +2459,7 @@ STATIC_INLINE void gc_mark_module_binding(jl_ptls_t ptls, jl_module_t *mb_parent jl_value_t *bindingkeyset = (jl_value_t *)jl_atomic_load_relaxed(&mb_parent->bindingkeyset); gc_assert_parent_validity((jl_value_t *)mb_parent, bindingkeyset); gc_try_claim_and_push(mq, bindingkeyset, &nptr); + gc_heap_snapshot_record_module_to_binding(mb_parent, bindings, bindingkeyset); gc_assert_parent_validity((jl_value_t *)mb_parent, (jl_value_t *)mb_parent->parent); gc_try_claim_and_push(mq, (jl_value_t *)mb_parent->parent, &nptr); size_t nusings = mb_parent->usings.len; @@ -2476,7 +2478,7 @@ STATIC_INLINE void gc_mark_module_binding(jl_ptls_t ptls, jl_module_t *mb_parent } } -void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) +void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t *fl_parent, jl_value_t **fl_begin, jl_value_t **fl_end) { jl_value_t *new_obj; // Decide whether need to chunk 
finlist @@ -2486,8 +2488,10 @@ void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t * gc_chunkqueue_push(mq, &c); fl_end = fl_begin + GC_CHUNK_BATCH_SIZE; } + size_t i = 0; for (; fl_begin < fl_end; fl_begin++) { - new_obj = *fl_begin; + jl_value_t **slot = fl_begin; + new_obj = *slot; if (__unlikely(new_obj == NULL)) continue; if (gc_ptr_tag(new_obj, 1)) { @@ -2498,6 +2502,13 @@ void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t * if (gc_ptr_tag(new_obj, 2)) continue; gc_try_claim_and_push(mq, new_obj, NULL); + if (fl_parent != NULL) { + gc_heap_snapshot_record_array_edge(fl_parent, slot); + } else { + // This is a list of objects following the same format as a finlist + // if `fl_parent` is NULL + gc_heap_snapshot_record_finlist(new_obj, ++i); + } } } @@ -2509,7 +2520,7 @@ void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) return; jl_value_t **fl_begin = (jl_value_t **)list->items + start; jl_value_t **fl_end = (jl_value_t **)list->items + len; - gc_mark_finlist_(mq, fl_begin, fl_end); + gc_mark_finlist_(mq, NULL, fl_begin, fl_end); } JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) @@ -3158,28 +3169,38 @@ static void gc_mark_roots(jl_gc_markqueue_t *mq) { // modules gc_try_claim_and_push(mq, jl_main_module, NULL); - gc_heap_snapshot_record_root((jl_value_t*)jl_main_module, "main_module"); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_main_module, "main_module"); // invisible builtin values gc_try_claim_and_push(mq, jl_an_empty_vec_any, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_an_empty_vec_any, "an_empty_vec_any"); gc_try_claim_and_push(mq, jl_module_init_order, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_module_init_order, "module_init_order"); for (size_t i = 0; i < jl_current_modules.size; i += 2) { if (jl_current_modules.table[i + 1] != HT_NOTFOUND) { gc_try_claim_and_push(mq, jl_current_modules.table[i], NULL); - 
gc_heap_snapshot_record_root((jl_value_t*)jl_current_modules.table[i], "top level module"); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_current_modules.table[i], "top level module"); } } gc_try_claim_and_push(mq, jl_anytuple_type_type, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_anytuple_type_type, "anytuple_type_type"); for (size_t i = 0; i < N_CALL_CACHE; i++) { jl_typemap_entry_t *v = jl_atomic_load_relaxed(&call_cache[i]); gc_try_claim_and_push(mq, v, NULL); + gc_heap_snapshot_record_array_edge_index((jl_value_t*)jl_anytuple_type_type, (jl_value_t*)v, i); } gc_try_claim_and_push(mq, jl_all_methods, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_all_methods, "all_methods"); gc_try_claim_and_push(mq, _jl_debug_method_invalidation, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)_jl_debug_method_invalidation, "debug_method_invalidation"); // constants gc_try_claim_and_push(mq, jl_emptytuple_type, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_emptytuple_type, "emptytuple_type"); gc_try_claim_and_push(mq, cmpswap_names, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)cmpswap_names, "cmpswap_names"); gc_try_claim_and_push(mq, jl_global_roots_list, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_global_roots_list, "global_roots_list"); gc_try_claim_and_push(mq, jl_global_roots_keyset, NULL); + gc_heap_snapshot_record_gc_roots((jl_value_t*)jl_global_roots_keyset, "global_roots_keyset"); } // find unmarked objects that need to be finalized from the finalizer list "list". 
diff --git a/src/gc.h b/src/gc.h index 217b8050e40ac..9de67f6e7c679 100644 --- a/src/gc.h +++ b/src/gc.h @@ -468,7 +468,7 @@ extern uv_sem_t gc_sweep_assists_needed; extern _Atomic(int) gc_n_threads_marking; extern _Atomic(int) gc_n_threads_sweeping; void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); -void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT; +void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t *fl_parent, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT; void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_NOTSAFEPOINT; void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq); void gc_mark_loop_serial(jl_ptls_t ptls); From 89cae45ea4f75ce81fff08ca6e731e72e838f4ad Mon Sep 17 00:00:00 2001 From: Jishnu Bhattacharya Date: Sat, 30 Dec 2023 22:38:45 +0530 Subject: [PATCH 13/13] Optimized arithmetic methods for strided triangular matrices (#52571) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This uses broadcasting for operations like `A::UpperTriangular + B::UpperTriangular` in case the parents are `StridedMatrix`es. Looping only over the triangular part is usually faster for large matrices, where presumably memory is the bottleneck. 
Some performance comparisons, using ```julia julia> U = UpperTriangular(rand(1000,1000)); julia> U1 = UnitUpperTriangular(rand(size(U)...)); ``` | Operation | master | PR | | --------------- | ---------- | ----- | |`-U` |`1.011 ms (3 allocations: 7.63 MiB)` |`559.680 μs (3 allocations: 7.63 MiB)` | |`U + U`/`U - U` |`971.740 μs (3 allocations: 7.63 MiB)` | `560.063 μs (3 allocations: 7.63 MiB)` | |`U + U1`/`U - U1` |`3.014 ms (9 allocations: 22.89 MiB)` | `944.772 μs (3 allocations: 7.63 MiB)` | |`U1 + U1` |`4.509 ms (12 allocations: 30.52 MiB)` | `1.687 ms (3 allocations: 7.63 MiB)` | |`U1 - U1` |`3.357 ms (9 allocations: 22.89 MiB)` | `1.763 ms (3 allocations: 7.63 MiB)` | I've retained the existing methods as fallback, in case there's current code that works without broadcasting. --- stdlib/LinearAlgebra/src/triangular.jl | 67 +++++++++++++++++++------ stdlib/LinearAlgebra/test/triangular.jl | 54 ++++++++++++++++++++ 2 files changed, 107 insertions(+), 14 deletions(-) diff --git a/stdlib/LinearAlgebra/src/triangular.jl b/stdlib/LinearAlgebra/src/triangular.jl index b18b4f0159a67..64e1b320f19dd 100644 --- a/stdlib/LinearAlgebra/src/triangular.jl +++ b/stdlib/LinearAlgebra/src/triangular.jl @@ -48,6 +48,7 @@ for t in (:LowerTriangular, :UnitLowerTriangular, :UpperTriangular, :UnitUpperTr real(A::$t{<:Real}) = A real(A::$t{<:Complex}) = (B = real(A.data); $t(B)) + real(A::$t{<:Complex, <:StridedMaybeAdjOrTransMat}) = $t(real.(A)) end end @@ -156,8 +157,26 @@ const UpperOrLowerTriangular{T,S} = Union{UpperOrUnitUpperTriangular{T,S}, Lower imag(A::UpperTriangular) = UpperTriangular(imag(A.data)) imag(A::LowerTriangular) = LowerTriangular(imag(A.data)) -imag(A::UnitLowerTriangular) = LowerTriangular(tril!(imag(A.data),-1)) -imag(A::UnitUpperTriangular) = UpperTriangular(triu!(imag(A.data),1)) +imag(A::UpperTriangular{<:Any,<:StridedMaybeAdjOrTransMat}) = imag.(A) +imag(A::LowerTriangular{<:Any,<:StridedMaybeAdjOrTransMat}) = imag.(A) +function 
imag(A::UnitLowerTriangular) + L = LowerTriangular(A.data) + Lim = similar(L) # must be mutable to set diagonals to zero + Lim .= imag.(L) + for i in 1:size(Lim,1) + Lim[i,i] = zero(Lim[i,i]) + end + return Lim +end +function imag(A::UnitUpperTriangular) + U = UpperTriangular(A.data) + Uim = similar(U) # must be mutable to set diagonals to zero + Uim .= imag.(U) + for i in 1:size(Uim,1) + Uim[i,i] = zero(Uim[i,i]) + end + return Uim +end Array(A::AbstractTriangular) = Matrix(A) parent(A::UpperOrLowerTriangular) = A.data @@ -481,6 +500,11 @@ function -(A::UnitUpperTriangular) UpperTriangular(Anew) end +# use broadcasting if the parents are strided, where we loop only over the triangular part +for TM in (:LowerTriangular, :UpperTriangular) + @eval -(A::$TM{<:Any, <:StridedMaybeAdjOrTransMat}) = broadcast(-, A) +end + tr(A::LowerTriangular) = tr(A.data) tr(A::UnitLowerTriangular) = size(A, 1) * oneunit(eltype(A)) tr(A::UpperTriangular) = tr(A.data) @@ -719,6 +743,16 @@ fillstored!(A::UnitUpperTriangular, x) = (fillband!(A.data, x, 1, size(A,2)-1); -(A::UnitLowerTriangular, B::UnitLowerTriangular) = LowerTriangular(tril(A.data, -1) - tril(B.data, -1)) -(A::AbstractTriangular, B::AbstractTriangular) = copyto!(similar(parent(A)), A) - copyto!(similar(parent(B)), B) +# use broadcasting if the parents are strided, where we loop only over the triangular part +for op in (:+, :-) + for TM1 in (:LowerTriangular, :UnitLowerTriangular), TM2 in (:LowerTriangular, :UnitLowerTriangular) + @eval $op(A::$TM1{<:Any, <:StridedMaybeAdjOrTransMat}, B::$TM2{<:Any, <:StridedMaybeAdjOrTransMat}) = broadcast($op, A, B) + end + for TM1 in (:UpperTriangular, :UnitUpperTriangular), TM2 in (:UpperTriangular, :UnitUpperTriangular) + @eval $op(A::$TM1{<:Any, <:StridedMaybeAdjOrTransMat}, B::$TM2{<:Any, <:StridedMaybeAdjOrTransMat}) = broadcast($op, A, B) + end +end + ###################### # BlasFloat routines # ###################### @@ -918,47 +952,52 @@ end for (t, unitt) in ((UpperTriangular, 
UnitUpperTriangular), (LowerTriangular, UnitLowerTriangular)) + tstrided = t{<:Any, <:StridedMaybeAdjOrTransMat} @eval begin (*)(A::$t, x::Number) = $t(A.data*x) + (*)(A::$tstrided, x::Number) = A .* x function (*)(A::$unitt, x::Number) - B = A.data*x + B = $t(A.data)*x for i = 1:size(A, 1) - B[i,i] = x + B.data[i,i] = x end - $t(B) + return B end (*)(x::Number, A::$t) = $t(x*A.data) + (*)(x::Number, A::$tstrided) = x .* A function (*)(x::Number, A::$unitt) - B = x*A.data + B = x*$t(A.data) for i = 1:size(A, 1) - B[i,i] = x + B.data[i,i] = x end - $t(B) + return B end (/)(A::$t, x::Number) = $t(A.data/x) + (/)(A::$tstrided, x::Number) = A ./ x function (/)(A::$unitt, x::Number) - B = A.data/x + B = $t(A.data)/x invx = inv(x) for i = 1:size(A, 1) - B[i,i] = invx + B.data[i,i] = invx end - $t(B) + return B end (\)(x::Number, A::$t) = $t(x\A.data) + (\)(x::Number, A::$tstrided) = x .\ A function (\)(x::Number, A::$unitt) - B = x\A.data + B = x\$t(A.data) invx = inv(x) for i = 1:size(A, 1) - B[i,i] = invx + B.data[i,i] = invx end - $t(B) + return B end end end diff --git a/stdlib/LinearAlgebra/test/triangular.jl b/stdlib/LinearAlgebra/test/triangular.jl index 74e1028bf109d..b60efba1d941a 100644 --- a/stdlib/LinearAlgebra/test/triangular.jl +++ b/stdlib/LinearAlgebra/test/triangular.jl @@ -526,6 +526,23 @@ for elty1 in (Float32, Float64, BigFloat, ComplexF32, ComplexF64, Complex{BigFlo end end +@testset "non-strided arithmetic" begin + for (T,T1) in ((UpperTriangular, UnitUpperTriangular), (LowerTriangular, UnitLowerTriangular)) + U = T(reshape(1:16, 4, 4)) + M = Matrix(U) + @test -U == -M + U1 = T1(reshape(1:16, 4, 4)) + M1 = Matrix(U1) + @test -U1 == -M1 + for op in (+, -) + for (A, MA) in ((U, M), (U1, M1)), (B, MB) in ((U, M), (U1, M1)) + @test op(A, B) == op(MA, MB) + end + end + @test imag(U) == zero(U) + end +end + # Matrix square root Atn = UpperTriangular([-1 1 2; 0 -2 2; 0 0 -3]) Atp = UpperTriangular([1 1 2; 0 2 2; 0 0 3]) @@ -894,6 +911,11 @@ end U = UT(F) 
@test -U == -Array(U) end + + F = FillArrays.Fill(3im, (4,4)) + for U in (UnitUpperTriangular(F), UnitLowerTriangular(F)) + @test imag(F) == imag(collect(F)) + end end @testset "error paths" begin @@ -911,4 +933,36 @@ end end end +@testset "arithmetic with partly uninitialized matrices" begin + @testset "$(typeof(A))" for A in (Matrix{BigFloat}(undef,2,2), Matrix{Complex{BigFloat}}(undef,2,2)') + A[1,1] = A[2,2] = A[2,1] = 4 + B = Matrix{eltype(A)}(undef, size(A)) + for MT in (LowerTriangular, UnitLowerTriangular) + L = MT(A) + B .= 0 + copyto!(B, L) + @test L * 2 == 2 * L == 2B + @test L/2 == B/2 + @test 2\L == 2\B + @test real(L) == real(B) + @test imag(L) == imag(B) + end + end + + @testset "$(typeof(A))" for A in (Matrix{BigFloat}(undef,2,2), Matrix{Complex{BigFloat}}(undef,2,2)') + A[1,1] = A[2,2] = A[1,2] = 4 + B = Matrix{eltype(A)}(undef, size(A)) + for MT in (UpperTriangular, UnitUpperTriangular) + U = MT(A) + B .= 0 + copyto!(B, U) + @test U * 2 == 2 * U == 2B + @test U/2 == B/2 + @test 2\U == 2\B + @test real(U) == real(B) + @test imag(U) == imag(B) + end + end +end + end # module TestTriangular