diff --git a/docs/source/deviation.rst b/docs/source/deviation.rst new file mode 100644 index 00000000000000..bce1f9b074a811 --- /dev/null +++ b/docs/source/deviation.rst @@ -0,0 +1,52 @@ +Computing Deviations +===================== + +This package provides functions to compute various deviations between arrays in a variety of ways: + +.. function:: sqL2dist(a, b) + + Squared L2 distance between ``a`` and ``b``, as :math:`\sum_{i=1}^n |a_i - b_i|^2`. + +.. function:: L2dist(a, b) + + L2 distance between ``a`` and ``b``, *i.e* ``sqrt(sqL2dist(a, b))``. + +.. function:: L1dist(a, b) + + L1 distance between ``a`` and ``b``, as :math:`\sum_{i=1}^n |a_i - b_i|`. + +.. function:: Linfdist(a, b) + + Linf distance between ``a`` and ``b``, as :math:`\mathrm{\mathop{max}}_{i=1:n} |a_i - b_i|`. + +.. function:: gkldiv(a, b) + + Generalized Kullback-Leibler divergence between two arrays ``a`` and ``b``, defined as + :math:`\sum_{i=1}^n (a_i * \log(a_i/b_i) - a_i + b_i)`. + + **Note:** When ``sum(a) == 1`` and ``sum(b) == 1``, it reduces to the KL-divergence in standard sense. + +.. function:: meanad(a, b) + + Mean absolute deviation between ``a`` and ``b``, *i.e.* ``mean(abs(a - b))``. + +.. function:: maxad(a, b) + + Maximum absolute deviation between ``a`` and ``b``, *i.e.* ``maximum(abs(a - b))``. + +.. function:: msd(a, b) + + Mean squared deviation between ``a`` and ``b``, *i.e.* ``mean(abs2(a - b))``. + +.. function:: rmsd(a, b[; normalize={true|false}]) + + Root mean squared deviation, *i.e.* ``sqrt(msd(a, b))``. + + The keyword argument ``normalize`` is default to ``false``. If it is set to ``true``, the result is normalized by ``(maximum(a) - minimum(a)``. + +.. function:: psnr(a, b, maxv) + + Peak signal-to-noise ratio, *i.e.* ``10 * log10(maxv^2 / msd(a, b))``. + +**Note:** all these functions are implemented in a reasonably efficient way without creating any temporary arrays in the middle. + diff --git a/docs/source/index.rst b/docs/source/index.rst index 7abd62a786517c..d48849445c0037 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -15,6 +15,7 @@ Welcome to StatsBase.jl's Documentation! weightvec.rst means.rst scalarstats.rst + deviation.rst cov.rst counts.rst ranking.rst diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 3ee65892fdf5cc..734e54b233fe7e 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -65,6 +65,18 @@ module StatsBase summarystats, # summary statistics describe, # print the summary statistics + # deviation + sqL2dist, # squared L2 distance between two arrays + L2dist, # L2 distance between two arrays + L1dist, # L1 distance between two arrays + Linfdist, # L-inf distance between two arrays + gkldiv, # (Generalized) Kullback-Leibler divergence between two vectors + meanad, # mean absolute deviation + maxad, # maximum absolute deviation + msd, # mean squared deviation + rmsd, # root mean squared deviation + psnr, # peak signal-to-noise ratio (in dB) + # cov scattermat, # scatter matrix (i.e. unnormalized covariance) @@ -143,6 +155,7 @@ module StatsBase include("weights.jl") include("moments.jl") include("scalarstats.jl") + include("deviation.jl") include("cov.jl") include("counts.jl") include("ranking.jl") diff --git a/src/deviation.jl b/src/deviation.jl new file mode 100644 index 00000000000000..b15c80f93d4e56 --- /dev/null +++ b/src/deviation.jl @@ -0,0 +1,80 @@ +# Computing deviation in a variety of ways + +# squared L2 distance +function sqL2dist{T<:Number}(a::AbstractArray{T}, b::AbstractArray{T}) + n = length(a) + length(b) == n || throw(DimensionMismatch("Input dimension mismatch")) + r = 0.0 + for i = 1:n + @inbounds r += abs2(a[i] - b[i]) + end + return r +end + +# L2 distance +L2dist{T<:Number}(a::AbstractArray{T}, b::AbstractArray{T}) = sqrt(sqL2dist(a, b)) + +# L1 distance +function L1dist{T<:Number}(a::AbstractArray{T}, b::AbstractArray{T}) + n = length(a) + length(b) == n || throw(DimensionMismatch("Input dimension mismatch")) + r = 0.0 + for i = 1:n + @inbounds r += abs(a[i] - b[i]) + end + return r +end + +# Linf distance +function Linfdist{T<:Number}(a::AbstractArray{T}, b::AbstractArray{T}) + n = length(a) + length(b) == n || throw(DimensionMismatch("Input dimension mismatch")) + r = 0.0 + for i = 1:n + @inbounds v = abs(a[i] - b[i]) + if r < v + r = v + end + end + return r +end + +# Generalized KL-divergence +function gkldiv{T<:FloatingPoint}(a::AbstractArray{T}, b::AbstractArray{T}) + n = length(a) + r = 0.0 + for i = 1:n + @inbounds ai = a[i] + @inbounds bi = b[i] + if ai > 0 + r += (ai * log(ai / bi) - ai + bi) + else + r += bi + end + end + return r::Float64 +end + +# MeanAD: mean absolute deviation +meanad{T<:Number}(a::AbstractArray{T}, b::AbstractArray{T}) = L1dist(a, b) / length(a) + +# MaxAD: maximum absolute deviation +maxad{T<:Number}(a::AbstractArray{T}, b::AbstractArray{T}) = Linfdist(a, b) + +# MSD: mean squared deviation +msd{T<:Number}(a::AbstractArray{T}, b::AbstractArray{T}) = sqL2dist(a, b) / length(a) + +# RMSD: root mean squared deviation +function rmsd{T<:Number}(a::AbstractArray{T}, b::AbstractArray{T}; normalize::Bool=false) + v = sqrt(msd(a, b)) + if normalize + amin, amax = extrema(a) + v /= (amax - amin) + end + return v +end + +# PSNR: peak signal-to-noise ratio +function psnr{T<:Real}(a::AbstractArray{T}, b::AbstractArray{T}, maxv::Real) + 20. * log10(maxv) - 10. * log10(msd(a, b)) +end diff --git a/test/deviation.jl b/test/deviation.jl new file mode 100644 index 00000000000000..0cf54a37fe6a6c --- /dev/null +++ b/test/deviation.jl @@ -0,0 +1,20 @@ +using StatsBase +using Base.Test + +a = rand(5, 6) +b = rand(5, 6) + +@test_approx_eq sqL2dist(a, b) sum(abs2(a - b)) +@test_approx_eq L2dist(a, b) sqrt(sqL2dist(a, b)) +@test_approx_eq L1dist(a, b) sum(abs(a - b)) +@test_approx_eq Linfdist(a, b) maximum(abs(a - b)) + +@test_approx_eq gkldiv(a, b) sum(a .* log(a ./ b) - a + b) + +@test_approx_eq meanad(a, b) mean(abs(a - b)) +@test_approx_eq maxad(a, b) maximum(abs(a - b)) +@test_approx_eq msd(a, b) mean(abs2(a - b)) +@test_approx_eq rmsd(a, b) sqrt(msd(a, b)) +@test_approx_eq rmsd(a, b; normalize=true) rmsd(a, b) / (maximum(a) - minimum(a)) +@test_approx_eq psnr(a, b, 2) 10 * log10(4 / msd(a, b)) + diff --git a/test/runtests.jl b/test/runtests.jl index 9da1a9214d1345..0a09388769a722 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,7 @@ tests = ["mathfuns", "weights", "moments", "scalarstats", + "deviation", "cov", "counts", "ranking",