diff --git a/Project.toml b/Project.toml index dbb9b0e..49ff5ca 100644 --- a/Project.toml +++ b/Project.toml @@ -3,7 +3,7 @@ uuid = "e54bda2e-c571-11ec-9d64-0242ac120002" license = "MIT" desc = "Julia implementation of Modal Decision Trees and Random Forest algorithms" authors = ["Giovanni PAGLIARINI"] -version = "0.5.0" +version = "0.5.1" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" diff --git a/src/ModalCART.jl b/src/ModalCART.jl index dd867fb..15cfc33 100644 --- a/src/ModalCART.jl +++ b/src/ModalCART.jl @@ -226,7 +226,8 @@ function generate_relevant_decisions( idxs, region, grouped_featsaggrsnopss, - grouped_featsnaggrss, + grouped_featsnaggrss; + kwargs... ) out = [] @inbounds for (i_modality, @@ -295,6 +296,7 @@ function generate_relevant_decisions( features_inds, grouped_featsaggrsnopss[i_modality], grouped_featsnaggrss[i_modality], + get(kwargs, :fixnans, false), ) decision_instantiator = _threshold->begin cond = ScalarCondition(metacondition, _threshold) @@ -348,8 +350,8 @@ Base.@propagate_inbounds @inline function optimize_node!( idxs :: AbstractVector{Int}, n_classes :: Int, rng :: Random.AbstractRNG, + kwargs... ) where{P,L<:_Label,D<:AbstractDecision,U,NSubRelationsFunction<:Function,S<:MCARTState} - # Region of idxs to use to perform the split region = node.region _ninstances = length(region) @@ -724,13 +726,15 @@ Base.@propagate_inbounds @inline function optimize_node!( idxs, region, grouped_featsaggrsnopss, - grouped_featsnaggrss, + grouped_featsnaggrss; + kwargs... 
) if isa(_is_classification, Val{true}) thresh_domain, additional_info = limit_threshold_domain(aggr_thresholds, Yf, Wf, loss_function, test_op, min_samples_leaf, perform_domain_optimization; n_classes = n_classes, nc = nc, nt = nt) else thresh_domain, additional_info = limit_threshold_domain(aggr_thresholds, Yf, Wf, loss_function, test_op, min_samples_leaf, perform_domain_optimization) end + # Look for the best threshold 'a', as in atoms like "feature >= a" for (_threshold, threshold_info) in zip(thresh_domain, additional_info) decision = decision_instantiator(_threshold) @@ -952,6 +956,7 @@ Base.@propagate_inbounds @inline function optimize_node!( idxs = deepcopy(idxs_copy), n_classes = n_classes, rng = copy(rng), + kwargs... ) end # TODO: evaluate the goodneess of the subtree? @@ -1029,7 +1034,7 @@ end _metaconditions = metaconditions(X) _grouped_metaconditions = SoleData.grouped_metaconditions(_metaconditions, _features) - + # _grouped_metaconditions::AbstractVector{<:AbstractVector{Tuple{<:ScalarMetaCondition}}} # [[(i_metacond, aggregator, metacondition)...]...] @@ -1061,7 +1066,7 @@ end grouped_featsnaggrss = last.(permodality_groups) # Process nodes recursively, using multi-threading - function process_node!(node, rng) + function process_node!(node, rng; kwargs...) # Note: better to spawn rng's beforehand, to preserve reproducibility independently from optimize_node! rng_l = spawn(rng) rng_r = spawn(rng) @@ -1084,17 +1089,17 @@ end idxs = idxs, rng = rng, lookahead = lookahead, - kwargs..., + kwargs... ) # !print_progress || ProgressMeter.update!(p, node.purity) !print_progress || ProgressMeter.next!(p, spinner="⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏") if !node.is_leaf - l = Threads.@spawn process_node!(node.l, rng_l) - r = Threads.@spawn process_node!(node.r, rng_r) + l = Threads.@spawn process_node!(node.l, rng_l; kwargs...) + r = Threads.@spawn process_node!(node.r, rng_r; kwargs...) 
wait(l), wait(r) end end - @sync Threads.@spawn process_node!(root, rng) + @sync Threads.@spawn process_node!(root, rng; kwargs...) !print_progress || ProgressMeter.finish!(p) @@ -1192,9 +1197,10 @@ end * " lookahead >= 0)") end - if SoleData.hasnans(Xs) - error("This algorithm does not allow NaN values") - end + # fixnans = get(kwargs, :fixnans, false) + # if !fixnans && SoleData.hasnans(Xs) + # error("This algorithm does not allow NaN values") + # end if nothing in Y error("This algorithm does not allow nothing values in Y") @@ -1236,7 +1242,7 @@ function fit_tree( kwargs..., ) where {L<:Union{CLabel,RLabel}, U} # Check validity of the input - check_input(Xs, Y, initconditions, W; profile = profile, lookahead = lookahead, kwargs...) + check_input(Xs, Y, initconditions, W; profile=profile, lookahead=lookahead, kwargs...) # Classification-only: transform labels to categorical form (indexed by integers) n_classes = begin diff --git a/src/build.jl b/src/build.jl index 3b0cb70..cbc9f34 100644 --- a/src/build.jl +++ b/src/build.jl @@ -68,6 +68,7 @@ function build_tree( ############################################################################## rng :: Random.AbstractRNG = Random.GLOBAL_RNG, print_progress :: Bool = true, + kwargs... ) where {L<:Label,U} @assert W isa AbstractVector || W in [nothing, :rebalance, :default] @@ -108,8 +109,8 @@ function build_tree( @assert isnothing(max_depth) || (max_depth >= 0) @assert isnothing(max_modal_depth) || (max_modal_depth >= 0) - fit_tree(X, Y, initconditions, W - ;########################################################################### + fit_tree(X, Y, initconditions, W; + ########################################################################### loss_function = loss_function, lookahead = lookahead, max_depth = max_depth, @@ -127,6 +128,7 @@ function build_tree( ############################################################################ rng = rng, print_progress = print_progress, + kwargs... 
) end @@ -162,6 +164,7 @@ function build_forest( rng :: Random.AbstractRNG = Random.GLOBAL_RNG, print_progress :: Bool = true, suppress_parity_warning :: Bool = false, + fixnans :: Bool = false, ) where {L<:Label,U} @assert W isa AbstractVector || W in [nothing, :rebalance, :default] diff --git a/src/interfaces/MLJ.jl b/src/interfaces/MLJ.jl index 8413762..3d9555d 100644 --- a/src/interfaces/MLJ.jl +++ b/src/interfaces/MLJ.jl @@ -69,13 +69,13 @@ depth(t::MDT.DTree) = height(t) ############################################################################################ ############################################################################################ -function MMI.fit(m::SymbolicModel, verbosity::Integer, X, y, var_grouping, classes_seen=nothing, w=nothing) +function MMI.fit(m::SymbolicModel, verbosity::Integer, X, y, var_grouping, classes_seen=nothing, w=nothing; kwargs...) # @show get_kwargs(m, X) model = begin if m isa ModalDecisionTree - MDT.build_tree(X, y, w; get_kwargs(m, X)...) + MDT.build_tree(X, y, w; get_kwargs(m, X)..., kwargs...) elseif m isa ModalRandomForest - MDT.build_forest(X, y, w; get_kwargs(m, X)...) + MDT.build_forest(X, y, w; get_kwargs(m, X)..., kwargs...) else error("Unexpected model type: $(typeof(m))") end @@ -171,8 +171,8 @@ end # DATA FRONT END ############################################################################################ -function MMI.reformat(m::SymbolicModel, X, y, w = nothing; passive_mode = false) - X, var_grouping = wrapdataset(X, m; passive_mode = passive_mode) +function MMI.reformat(m::SymbolicModel, X, y, w = nothing; passive_mode = false, kwargs...) + X, var_grouping = wrapdataset(X, m; passive_mode = passive_mode, kwargs...) 
y, classes_seen = fix_y(y) (X, y, var_grouping, classes_seen, w) end diff --git a/src/interfaces/MLJ/clean.jl b/src/interfaces/MLJ/clean.jl index 6d05c77..ba29426 100644 --- a/src/interfaces/MLJ/clean.jl +++ b/src/interfaces/MLJ/clean.jl @@ -88,22 +88,6 @@ function MMI.clean!(m::SymbolicModel) ######################################################################################## ######################################################################################## ######################################################################################## - - if !(isnothing(m.relations) || - m.relations isa Symbol && m.relations in keys(AVAILABLE_RELATIONS) || - m.relations isa Vector{<:AbstractRelation} || - m.relations isa Function - ) - warning *= "relations should be in $(collect(keys(AVAILABLE_RELATIONS))) " * - "or a vector of SoleLogics.AbstractRelation's, " * - "but $(m.relations) " * - "was provided. Defaulting to $(mlj_default_relations_str).\n" - m.relations = nothing - end - - isnothing(m.relations) && (m.relations = mlj_default_relations) - m.relations isa Vector{<:AbstractRelation} && (m.relations = m.relations) - # Patch name: features -> conditions if !isnothing(m.features) if !isnothing(m.conditions) @@ -113,24 +97,10 @@ function MMI.clean!(m::SymbolicModel) m.conditions = m.features m.features = nothing end - - if !(isnothing(m.conditions) || - m.conditions isa Vector{<:Union{SoleData.VarFeature,Base.Callable}} || - m.conditions isa Vector{<:Tuple{Base.Callable,Integer}} || - m.conditions isa Vector{<:Tuple{TestOperator,<:Union{SoleData.VarFeature,Base.Callable}}} || - m.conditions isa Vector{<:SoleData.ScalarMetaCondition} - ) - warning *= "conditions should be either:" * - "a) a vector of features (i.e., callables to be associated to all variables, or SoleData.VarFeature objects);\n" * - "b) a vector of tuples (callable,var_id);\n" * - "c) a vector of tuples (test_operator,features);\n" * - "d) a vector of SoleData.ScalarMetaCondition;\n" * - 
"but $(m.conditions) " * - "was provided. Defaulting to $(mlj_default_conditions_str).\n" - m.conditions = nothing - end - - isnothing(m.conditions) && (m.conditions = mlj_default_conditions) + + m.relations, _w = SoleData.autorelations(m.relations); warning *= _w + m.conditions, _w = SoleData.autoconditions(m.conditions); warning *= _w + m.downsize, _w = SoleData.autodownsize(m); warning *= _w if !(isnothing(m.initconditions) || m.initconditions isa Symbol && m.initconditions in keys(AVAILABLE_INITCONDITIONS) || @@ -148,20 +118,6 @@ function MMI.clean!(m::SymbolicModel) ######################################################################################## ######################################################################################## - m.downsize = begin - if m.downsize == true - make_downsizing_function(m) - elseif m.downsize == false - identity - elseif m.downsize isa NTuple{N,Integer} where N - make_downsizing_function(m.downsize) - elseif m.downsize isa Function - m.downsize - else - error("Unexpected value for `downsize` encountered: $(m.downsize)") - end - end - if m.rng isa Integer m.rng = Random.MersenneTwister(m.rng) end diff --git a/src/interfaces/MLJ/default-parameters.jl b/src/interfaces/MLJ/default-parameters.jl index e6084d8..a2f85a3 100644 --- a/src/interfaces/MLJ/default-parameters.jl +++ b/src/interfaces/MLJ/default-parameters.jl @@ -1,7 +1,4 @@ using SoleData.DimensionalDatasets -using SoleData.DimensionalDatasets: UniformFullDimensionalLogiset -using SoleData: ScalarOneStepMemoset, AbstractFullMemoset -using SoleData: naturalconditions const ALLOW_GLOBAL_SPLITS = true @@ -21,134 +18,6 @@ sqrt_f(x) = ceil(Int, sqrt(x)) const mlj_mrf_default_n_subfeatures = sqrt_f const mlj_mrf_default_sampling_fraction = 0.7 -AVAILABLE_RELATIONS = OrderedDict{Symbol,Function}([ - :none => (d)->AbstractRelation[], - :IA => (d)->[globalrel, (d == 1 ? SoleLogics.IARelations : (d == 2 ? 
SoleLogics.IA2DRelations : error("Unexpected dimensionality ($d).")))...], - :IA3 => (d)->[globalrel, (d == 1 ? SoleLogics.IA3Relations : (d == 2 ? SoleLogics.IA32DRelations : error("Unexpected dimensionality ($d).")))...], - :IA7 => (d)->[globalrel, (d == 1 ? SoleLogics.IA7Relations : (d == 2 ? SoleLogics.IA72DRelations : error("Unexpected dimensionality ($d).")))...], - :RCC5 => (d)->[globalrel, SoleLogics.RCC5Relations...], - :RCC8 => (d)->[globalrel, SoleLogics.RCC8Relations...], -]) - -mlj_default_relations = nothing - -mlj_default_relations_str = "either no relation (adimensional data), " * - "IA7 interval relations (1- and 2-dimensional data)." - # , or RCC5 relations " * - # "(2-dimensional data)." - -function defaultrelations(dataset, relations) - # @show typeof(dataset) - if dataset isa Union{ - SupportedLogiset{W,U,FT,FR,L,N,<:Tuple{<:ScalarOneStepMemoset}} where {W,U,FT,FR,L,N}, - SupportedLogiset{W,U,FT,FR,L,N,<:Tuple{<:ScalarOneStepMemoset,<:AbstractFullMemoset}} where {W,U,FT,FR,L,N}, - } - if relations == mlj_default_relations - MDT.relations(dataset) - else - error("Unexpected dataset type: $(typeof(dataset)).") - end - else - symb = begin - if relations isa Symbol - relations - elseif dimensionality(dataset) == 0 - :none - elseif dimensionality(dataset) == 1 - :IA7 - elseif dimensionality(dataset) == 2 - :IA7 - # :RCC8 - else - error("Cannot infer relation set for dimensionality $(repr(dimensionality(dataset))). " * - "Dimensionality should be 0, 1 or 2.") - end - end - - d = dimensionality(dataset) - if d == 0 - AVAILABLE_RELATIONS[:none](d) - else - AVAILABLE_RELATIONS[symb](d) - end - end -end - -# Infer relation set from model.relations parameter and the (unimodal) dataset. 
-function readrelations(model, dataset) - if model.relations == mlj_default_relations || model.relations isa Symbol - defaultrelations(dataset, model.relations) - else - if dataset isa Union{ - SupportedLogiset{W,U,FT,FR,L,N,<:Tuple{<:ScalarOneStepMemoset}} where {W,U,FT,FR,L,N}, - SupportedLogiset{W,U,FT,FR,L,N,<:Tuple{<:ScalarOneStepMemoset,<:AbstractFullMemoset}} where {W,U,FT,FR,L,N}, - } - rels = model.relations(dataset) - @assert issubset(rels, MDT.relations(dataset)) "Could not find " * - "specified relations $(SoleLogics.displaysyntaxvector(rels)) in " * - "logiset relations $(SoleLogics.displaysyntaxvector(MDT.relations(dataset)))." - rels - else - model.relations(dataset) - end - end -end - - -mlj_default_conditions = nothing - -mlj_default_conditions_str = "scalar conditions (test operators ≥ and <) " * - "on either minimum and maximum feature functions (if dimensional data is provided), " * - "or the features of the logiset, if one is provided." - -function defaultconditions(dataset) - if dataset isa Union{ - SupportedLogiset{W,U,FT,FR,L,N,<:Tuple{<:ScalarOneStepMemoset}} where {W,U,FT,FR,L,N}, - SupportedLogiset{W,U,FT,FR,L,N,<:Tuple{<:ScalarOneStepMemoset,<:AbstractFullMemoset}} where {W,U,FT,FR,L,N}, - } - MDT.metaconditions(dataset) - elseif dataset isa UniformFullDimensionalLogiset - vcat([ - [ - ScalarMetaCondition(feature, ≥), - (all(i_instance->SoleData.nworlds(frame(dataset, i_instance)) == 1, 1:ninstances(dataset)) ? - [] : - [ScalarMetaCondition(feature, <)] - )... - ] - for feature in features(dataset)]...) 
- else - if all(i_instance->SoleData.nworlds(frame(dataset, i_instance)) == 1, 1:ninstances(dataset)) - [identity] - else - [minimum, maximum] - end - end -end - -function readconditions(model, dataset) - conditions = begin - if model.conditions == mlj_default_conditions - defaultconditions(dataset) - else - model.conditions - end - end - - if dataset isa Union{ - SupportedLogiset{W,U,FT,FR,L,N,<:Tuple{<:ScalarOneStepMemoset}} where {W,U,FT,FR,L,N}, - SupportedLogiset{W,U,FT,FR,L,N,<:Tuple{<:ScalarOneStepMemoset,<:AbstractFullMemoset}} where {W,U,FT,FR,L,N}, - } - @assert issubset(conditions, MDT.metaconditions(dataset)) "Could not find " * - "specified conditions $(SoleLogics.displaysyntaxvector(conditions)) in " * - "logiset metaconditions $(SoleLogics.displaysyntaxvector(MDT.metaconditions(dataset)))." - conditions - else - # @show typeof(dataset) - naturalconditions(dataset, conditions, model.featvaltype) - end -end - mlj_default_initconditions = nothing mlj_default_initconditions_str = "" * diff --git a/src/interfaces/MLJ/docstrings.jl b/src/interfaces/MLJ/docstrings.jl index 8bf90d7..b4350fa 100644 --- a/src/interfaces/MLJ/docstrings.jl +++ b/src/interfaces/MLJ/docstrings.jl @@ -97,7 +97,8 @@ $(forest_hyperparams_str) $(n_subfeatures_str) -- `feature=[minimum, maximum]` Feature functions to be used by the tree to mine scalar conditions (e.g., `minimum[V2] ≥ 10`) +- `features` Feature functions to be used by the tree to mine scalar conditions (e.g., `minimum[V2] ≥ 10`). + This hyper-parameter defaults to $(SoleData.mlj_default_conditions_str) - `featvaltype=Float64` Output type for feature functions, when it cannot be inferred (e.g., with custom feature functions provided). @@ -107,12 +108,12 @@ $(n_subfeatures_str) :RCC5 and :RCC8 are [Region Connection Calculus algebras](https://en.wikipedia.org/wiki/Region_connection_calculus) with 5 and 8 topological operators, respectively. 
Relations from :IA, :IA3, :IA7, capture directional aspects of the relative arrangement of two intervals in time (or rectangles in a 2D space), while relations from :RCC5 and :RCC8 only capture topological aspects and are therefore rotation and flip-invariant. - This hyper-parameter defaults to $(mlj_default_relations_str). + This hyper-parameter defaults to $(SoleData.mlj_default_relations_str) - `initconditions=nothing` initial conditions for evaluating modal decisions at the root; it can be a symbol in [:start_with_global, :start_at_center]. :start_with_global forces the first decision to be a *global* decision (e.g., `⟨G⟩ (minimum[V2] ≥ 10)`, which translates to "there exists a region where the minimum of variable 2 is higher than 10"). :start_at_center forces the first decision to be evaluated on the smallest central world, that is, the central value of a time-series, or the central pixel of an image. - This hyper-parameter defaults to $(mlj_default_initconditions_str). + This hyper-parameter defaults to $(mlj_default_initconditions_str). - `downsize=true` Whether to perform automatic downsizing, by means of moving average. In fact, this algorithm has high complexity (both time and space), and can only handle small time-series (< 100 points) & small images (< 10 x 10 pixels). @@ -321,7 +322,7 @@ ModalRandomForest # See also # [DecisionTree.jl](https://github.com/JuliaAI/DecisionTree.jl) and # the unwrapped model type -# [MLJDecisionTreeInterface.DecisionTree.DecisionTreeRegressor](@ref). +# [MLJDecisionTreeInterface.DecisionTree.DecisionTreeRegressor](@ref). # """ # DecisionTreeRegressor @@ -339,4 +340,4 @@ ModalRandomForest # See also # [DecisionTree.jl](https://github.com/JuliaAI/DecisionTree.jl) and # the unwrapped model type -# [MLJDecisionTreeInterface.DecisionTree.RandomForestRegressor](@ref). +# [MLJDecisionTreeInterface.DecisionTree.RandomForestRegressor](@ref). 
diff --git a/src/interfaces/MLJ/downsize.jl b/src/interfaces/MLJ/downsize.jl index c60bf7d..4285f66 100644 --- a/src/interfaces/MLJ/downsize.jl +++ b/src/interfaces/MLJ/downsize.jl @@ -1,130 +1,10 @@ -using StatsBase -using StatsBase: mean -using SoleBase: movingwindow -using SoleData: AbstractDimensionalDataset +# using SoleData: make_downsizing_function +import SoleData: make_downsizing_function -DOWNSIZE_MSG = "If this process gets killed, please downsize your dataset beforehand." - -function make_downsizing_function(channelsize::NTuple) - return function downsize(instance) - return moving_average(instance, channelsize) - end -end function make_downsizing_function(::TreeModel) - function downsize(instance) - channelsize = MultiData.instance_channelsize(instance) - nvariables = MultiData.instance_nvariables(instance) - channelndims = length(channelsize) - if channelndims == 1 - n_points = channelsize[1] - if nvariables > 30 && n_points > 100 - # @warn "Downsizing series $(n_points) points to $(100) points ($(nvariables) variables). $DOWNSIZE_MSG" - instance = moving_average(instance, 100) - elseif n_points > 150 - # @warn "Downsizing series $(n_points) points to $(150) points ($(nvariables) variables). $DOWNSIZE_MSG" - instance = moving_average(instance, 150) - end - elseif channelndims == 2 - if nvariables > 30 && prod(channelsize) > prod((7,7),) - new_channelsize = min.(channelsize, (7,7)) - # @warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG" - instance = moving_average(instance, new_channelsize) - elseif prod(channelsize) > prod((10,10),) - new_channelsize = min.(channelsize, (10,10)) - # @warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). 
$DOWNSIZE_MSG" - instance = moving_average(instance, new_channelsize) - end - end - instance - end + make_downsizing_function(Val(1)) end - function make_downsizing_function(::ForestModel) - function downsize(instance) - channelsize = MultiData.instance_channelsize(instance) - nvariables = MultiData.instance_nvariables(instance) - channelndims = length(channelsize) - if channelndims == 1 - n_points = channelsize[1] - if nvariables > 30 && n_points > 100 - # @warn "Downsizing series $(n_points) points to $(100) points ($(nvariables) variables). $DOWNSIZE_MSG" - instance = moving_average(instance, 100) - elseif n_points > 150 - # @warn "Downsizing series $(n_points) points to $(150) points ($(nvariables) variables). $DOWNSIZE_MSG" - instance = moving_average(instance, 150) - end - elseif channelndims == 2 - if nvariables > 30 && prod(channelsize) > prod((4,4),) - new_channelsize = min.(channelsize, (4,4)) - # @warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG" - instance = moving_average(instance, new_channelsize) - elseif prod(channelsize) > prod((7,7),) - new_channelsize = min.(channelsize, (7,7)) - # @warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG" - instance = moving_average(instance, new_channelsize) - end - end - instance - end -end - -# TODO move to MultiData/SoleData - -_mean(::Type{T}, vals::AbstractArray{T}) where {T<:Number} = StatsBase.mean(vals) -_mean(::Type{T1}, vals::AbstractArray{T2}) where {T1<:AbstractFloat,T2<:Integer} = T1(StatsBase.mean(vals)) -_mean(::Type{T1}, vals::AbstractArray{T2}) where {T1<:Integer,T2<:AbstractFloat} = round(T1, StatsBase.mean(vals)) - -# # 1D -# function moving_average( -# instance::AbstractArray{T,1}; -# kwargs... 
-# ) where {T<:Union{Nothing,Number}} -# npoints = length(instance) -# return [_mean(T, instance[idxs]) for idxs in movingwindow(npoints; kwargs...)] -# end - -# # 1D -# function moving_average( -# instance::AbstractArray{T,1}, -# nwindows::Integer, -# relative_overlap::AbstractFloat = .5, -# ) where {T<:Union{Nothing,Number}} -# npoints = length(instance) -# return [_mean(T, instance[idxs]) for idxs in movingwindow(npoints; nwindows = nwindows, relative_overlap = relative_overlap)] -# end - -# 1D-instance -function moving_average( - instance::AbstractArray{T,2}, - nwindows::Union{Integer,Tuple{Integer}}, - relative_overlap::AbstractFloat = .5, -) where {T<:Union{Nothing,Number}} - nwindows = nwindows isa Tuple{<:Integer} ? nwindows[1] : nwindows - npoints, n_variables = size(instance) - new_instance = similar(instance, (nwindows, n_variables)) - for i_variable in 1:n_variables - new_instance[:, i_variable] .= [_mean(T, instance[idxs, i_variable]) for idxs in movingwindow(npoints; nwindows = nwindows, relative_overlap = relative_overlap)] - end - return new_instance -end - -# 2D-instance -function moving_average( - instance::AbstractArray{T,3}, - new_channelsize::Tuple{Integer,Integer}, - relative_overlap::AbstractFloat = .5, -) where {T<:Union{Nothing,Number}} - n_instance, n_Y, n_variables = size(instance) - windows_1 = movingwindow(n_instance; nwindows = new_channelsize[1], relative_overlap = relative_overlap) - windows_2 = movingwindow(n_Y; nwindows = new_channelsize[2], relative_overlap = relative_overlap) - new_instance = similar(instance, (new_channelsize..., n_variables)) - for i_variable in 1:n_variables - new_instance[:, :, i_variable] .= [_mean(T, instance[idxs1, idxs2, i_variable]) for idxs1 in windows_1, idxs2 in windows_2] - end - return new_instance -end - -function moving_average(dataset::AbstractDimensionalDataset, args...; kwargs...) 
- return map(instance->moving_average(instance, args...; kwargs...), eachinstance(dataset)) + make_downsizing_function(Val(2)) end diff --git a/src/interfaces/MLJ/sanity-checks.jl b/src/interfaces/MLJ/sanity-checks.jl deleted file mode 100644 index 2588f81..0000000 --- a/src/interfaces/MLJ/sanity-checks.jl +++ /dev/null @@ -1,22 +0,0 @@ - -# if model.check_conditions == true -# check_conditions(model.conditions) -# end -# function check_conditions(conditions) -# if isnothing(conditions) -# return -# end -# # Check that feature extraction functions are scalar -# wrong_conditions = filter((f)->begin -# !all( -# (ch)->!(f isa Base.Callable) || -# (ret = f(ch); isa(ret, Real) && typeof(ret) == eltype(ch)), -# [collect(1:10), collect(1.:10.)] -# ) -# end, conditions) -# @assert length(wrong_conditions) == 0 "When specifying feature extraction functions " * -# "for inferring `conditions`, please specify " * -# "scalar functions accepting an object of type `AbstractArray{T}` " * -# "and returning an object of type `T`, with `T<:Real`. " * -# "Instead, got wrong feature functions: $(wrong_conditions)." 
-# end diff --git a/src/interfaces/MLJ/wrapdataset.jl b/src/interfaces/MLJ/wrapdataset.jl index f73346c..6cd8753 100644 --- a/src/interfaces/MLJ/wrapdataset.jl +++ b/src/interfaces/MLJ/wrapdataset.jl @@ -5,15 +5,153 @@ using SoleData: AbstractModalLogiset, SupportedLogiset using MultiData using MultiData: dataframe2dimensional -# UNI -# AbstractArray -> scalarlogiset -> supportedlogiset -# SupportedLogiset -> supportedlogiset -# AbstractModalLogiset -> supportedlogiset - -# MULTI -# SoleData.MultiDataset -> multilogiset -# AbstractDataFrame -> naturalgrouping -> multilogiset -# MultiLogiset -> multilogiset +# function wrapdataset( +# X, +# model, +# force_var_grouping::Union{Nothing,AbstractVector{<:AbstractVector}} = nothing; +# passive_mode = false +# ) +# if X isa MultiLogiset +# if !isnothing(force_var_grouping) +# @warn "Ignoring var_grouping $(force_var_grouping) (a MultiLogiset was provided)." +# end +# multimodal_X, var_grouping = X, nothing +# return multimodal_X, var_grouping +# end + +# # Vector of instance values +# # Matrix instance x variable -> Matrix variable x instance +# if X isa AbstractVector +# X = collect(reshape(X, 1, length(X))) +# elseif X isa AbstractMatrix +# X = collect(X') +# end + +# if X isa AbstractArray # Cube +# if !(X isa Union{AbstractVector,AbstractMatrix}) +# @warn "AbstractArray of $(ndims(X)) dimensions and size $(size(X)) encountered. " * +# "This will be interpreted as a dataset of $(size(X)[end]) instances, " * +# "$(size(X)[end-1]) variables, and channel size $(size(X)[1:end-2])." +# # "datasets ($(typeof(X)) encountered)" +# end + +# X = eachslice(X; dims=ndims(X)) +# end + +# X = begin +# if X isa AbstractDimensionalDataset +# X = model.downsize.(eachinstance(X)) + +# if !passive_mode +# @info "Precomputing logiset..." 
+# metaconditions = readconditions(model, X) +# features = unique(SoleData.feature.(metaconditions)) +# scalarlogiset(X, features; +# use_onestep_memoization = true, +# conditions = metaconditions, +# relations = readrelations(model, X), +# print_progress = (ninstances(X) > 500) +# ) +# else +# MultiData.dimensional2dataframe(X) +# end +# elseif SoleData.hassupports(X) +# X +# elseif X isa AbstractModalLogiset +# SupportedLogiset(X; +# use_onestep_memoization = true, +# conditions = readconditions(model, X), +# relations = readrelations(model, X) +# ) +# elseif X isa AbstractMultiDataset +# X +# elseif Tables.istable(X) +# DataFrame(X) +# else +# X +# end +# end + +# # @show X +# # @show collect.(X) +# # readline() + +# # DataFrame -> MultiDataset + variable grouping (needed for printing) +# X, var_grouping = begin +# if X isa AbstractDataFrame + +# allowedcoltypes = Union{Real,AbstractArray{<:Real,0},AbstractVector{<:Real},AbstractMatrix{<:Real}} +# wrong_columns = filter(((colname,c),)->!(eltype(c) <: allowedcoltypes), collect(zip(names(X), eachcol(X)))) +# @assert length(wrong_columns) == 0 "Invalid columns " * +# "encountered: `$(join(first.(wrong_columns), "`, `", "` and `"))`. $(MDT).jl only allows " * +# "variables that are `Real` and `AbstractArray{<:Real,N}` with N ∈ {0,1,2}. " * +# "Got: `$(join(eltype.(last.(wrong_columns)), "`, `", "` and `"))`" * (length(wrong_columns) > 1 ? ", respectively" : "") * "." 
+ +# var_grouping = begin +# if isnothing(force_var_grouping) +# var_grouping = SoleData.naturalgrouping(X; allow_variable_drop = true) +# if !(length(var_grouping) == 1 && length(var_grouping[1]) == ncol(X)) +# @info "Using variable grouping:\n" * +# # join(map(((i_mod,variables),)->"[$i_mod] -> [$(join(string.(variables), ", "))]", enumerate(var_grouping)), "\n") +# join(map(((i_mod,variables),)->"\t{$i_mod} => $(Tuple(variables))", enumerate(var_grouping)), "\n") +# end +# var_grouping +# else +# @assert force_var_grouping isa AbstractVector{<:AbstractVector} "$(typeof(force_var_grouping))" +# force_var_grouping +# end +# end + +# md = MultiDataset(X, var_grouping) + +# # Downsize +# md = MultiDataset([begin +# mod, varnames = dataframe2dimensional(mod) +# mod = model.downsize.(eachinstance(mod)) +# SoleData.dimensional2dataframe(mod, varnames) +# end for mod in eachmodality(md)]) + +# md, var_grouping +# else +# X, nothing +# end +# end + +# # println(X) +# # println(modality(X, 1)) +# multimodal_X = begin +# if X isa SoleData.AbstractMultiDataset +# if !passive_mode || !SoleData.ismultilogiseed(X) +# @info "Precomputing logiset..." +# MultiLogiset([begin +# _metaconditions = readconditions(model, mod) +# features = unique(SoleData.feature.(_metaconditions)) +# # @show _metaconditions +# # @show features +# scalarlogiset(mod, features; +# use_onestep_memoization = true, +# conditions = _metaconditions, +# relations = readrelations(model, mod), +# print_progress = (ninstances(X) > 500) +# ) +# end for mod in eachmodality(X) +# ]) +# else +# X +# end +# elseif X isa AbstractModalLogiset +# MultiLogiset(X) +# elseif X isa MultiLogiset +# X +# else +# error("Unexpected dataset type: $(typeof(X)). 
Allowed dataset types are " * +# "AbstractArray, AbstractDataFrame, " * +# "SoleData.AbstractMultiDataset and SoleData.AbstractModalLogiset.") +# end +# end + +# return (multimodal_X, var_grouping) +# end function wrapdataset( X, @@ -21,144 +159,13 @@ function wrapdataset( force_var_grouping::Union{Nothing,AbstractVector{<:AbstractVector}} = nothing; passive_mode = false ) - if X isa MultiLogiset - if !isnothing(force_var_grouping) - @warn "Ignoring var_grouping $(force_var_grouping) (a MultiLogiset was provided)." - end - multimodal_X, var_grouping = X, nothing - return multimodal_X, var_grouping - end - - # Vector of instance values - # Matrix instance x variable -> Matrix variable x instance - if X isa AbstractVector - X = collect(reshape(X, 1, length(X))) - elseif X isa AbstractMatrix - X = collect(X') - end - - if X isa AbstractArray # Cube - if !(X isa Union{AbstractVector,AbstractMatrix}) - @warn "AbstractArray of $(ndims(X)) dimensions and size $(size(X)) encountered. " * - "This will be interpreted as a dataset of $(size(X)[end]) instances, " * - "$(size(X)[end-1]) variables, and channel size $(size(X)[1:end-2])." - # "datasets ($(typeof(X)) encountered)" - end - - X = eachslice(X; dims=ndims(X)) - end - - X = begin - if X isa AbstractDimensionalDataset - X = model.downsize.(eachinstance(X)) - - if !passive_mode - @info "Precomputing logiset..." 
- metaconditions = readconditions(model, X) - features = unique(SoleData.feature.(metaconditions)) - scalarlogiset(X, features; - use_onestep_memoization = true, - conditions = metaconditions, - relations = readrelations(model, X), - print_progress = (ninstances(X) > 500) - ) - else - MultiData.dimensional2dataframe(X) - end - elseif SoleData.hassupports(X) - X - elseif X isa AbstractModalLogiset - SupportedLogiset(X; - use_onestep_memoization = true, - conditions = readconditions(model, X), - relations = readrelations(model, X) - ) - elseif X isa AbstractMultiDataset - X - elseif Tables.istable(X) - DataFrame(X) - else - X - end - end - - # @show X - # @show collect.(X) - # readline() - - # DataFrame -> MultiDataset + variable grouping (needed for printing) - X, var_grouping = begin - if X isa AbstractDataFrame - - allowedcoltypes = Union{Real,AbstractArray{<:Real,0},AbstractVector{<:Real},AbstractMatrix{<:Real}} - wrong_columns = filter(((colname,c),)->!(eltype(c) <: allowedcoltypes), collect(zip(names(X), eachcol(X)))) - @assert length(wrong_columns) == 0 "Invalid columns " * - "encountered: `$(join(first.(wrong_columns), "`, `", "` and `"))`. $(MDT).jl only allows " * - "variables that are `Real` and `AbstractArray{<:Real,N}` with N ∈ {0,1,2}. " * - "Got: `$(join(eltype.(last.(wrong_columns)), "`, `", "` and `"))`" * (length(wrong_columns) > 1 ? ", respectively" : "") * "." 
- - var_grouping = begin - if isnothing(force_var_grouping) - var_grouping = SoleData.naturalgrouping(X; allow_variable_drop = true) - if !(length(var_grouping) == 1 && length(var_grouping[1]) == ncol(X)) - @info "Using variable grouping:\n" * - # join(map(((i_mod,variables),)->"[$i_mod] -> [$(join(string.(variables), ", "))]", enumerate(var_grouping)), "\n") - join(map(((i_mod,variables),)->"\t{$i_mod} => $(Tuple(variables))", enumerate(var_grouping)), "\n") - end - var_grouping - else - @assert force_var_grouping isa AbstractVector{<:AbstractVector} "$(typeof(force_var_grouping))" - force_var_grouping - end - end - - md = MultiDataset(X, var_grouping) - - # Downsize - md = MultiDataset([begin - mod, varnames = dataframe2dimensional(mod) - mod = model.downsize.(eachinstance(mod)) - SoleData.dimensional2dataframe(mod, varnames) - end for mod in eachmodality(md)]) - - md, var_grouping - else - X, nothing - end - end - - # println(X) - # println(modality(X, 1)) - multimodal_X = begin - if X isa SoleData.AbstractMultiDataset - if !passive_mode || !SoleData.ismultilogiseed(X) - @info "Precomputing logiset..." - MultiLogiset([begin - _metaconditions = readconditions(model, mod) - features = unique(SoleData.feature.(_metaconditions)) - # @show _metaconditions - # @show features - scalarlogiset(mod, features; - use_onestep_memoization = true, - conditions = _metaconditions, - relations = readrelations(model, mod), - print_progress = (ninstances(X) > 500) - ) - end for mod in eachmodality(X) - ]) - else - X - end - elseif X isa AbstractModalLogiset - MultiLogiset(X) - elseif X isa MultiLogiset - X - else - error("Unexpected dataset type: $(typeof(X)). 
Allowed dataset types are " * - "AbstractArray, AbstractDataFrame, " * - "SoleData.AbstractMultiDataset and SoleData.AbstractModalLogiset.") - end - end - - return (multimodal_X, var_grouping) -end + SoleData.autologiset( + X; + force_var_grouping = force_var_grouping, + downsize = model.downsize, + conditions = model.conditions, + featvaltype = model.featvaltype, + relations = model.relations, + passive_mode = passive_mode, + ) +end \ No newline at end of file diff --git a/src/interpret-onestep-decisions.jl b/src/interpret-onestep-decisions.jl index aeec43c..ffc6034 100644 --- a/src/interpret-onestep-decisions.jl +++ b/src/interpret-onestep-decisions.jl @@ -129,24 +129,25 @@ Base.@propagate_inbounds @resumable function generate_decisions( features_inds::AbstractVector, grouped_featsaggrsnops::AbstractVector{<:AbstractDict{<:Aggregator,<:AbstractVector{<:ScalarMetaCondition}}}, grouped_featsnaggrs::AbstractVector{<:AbstractVector{Tuple{<:Integer,<:Aggregator}}}, + fixnans::Bool, ) where {W<:AbstractWorld,U} # Propositional splits if allow_propositional_decisions - for decision in generate_propositional_decisions(X, i_instances, Sf, features_inds, grouped_featsaggrsnops, grouped_featsnaggrs) + for decision in generate_propositional_decisions(X, i_instances, Sf, features_inds, grouped_featsaggrsnops, grouped_featsnaggrs; fixnans=fixnans) # @logmsg LogDebug " Testing decision: $(displaydecision(decision))" @yield decision end end # Global splits if allow_global_decisions - for decision in generate_global_decisions(X, i_instances, Sf, features_inds, grouped_featsaggrsnops, grouped_featsnaggrs) + for decision in generate_global_decisions(X, i_instances, Sf, features_inds, grouped_featsaggrsnops, grouped_featsnaggrs; fixnans=fixnans) # @logmsg LogDebug " Testing decision: $(displaydecision(decision))" @yield decision end end # Modal splits if allow_modal_decisions - for decision in generate_modal_decisions(X, i_instances, Sf, modal_relations_inds, features_inds, 
grouped_featsaggrsnops, grouped_featsnaggrs) + for decision in generate_modal_decisions(X, i_instances, Sf, modal_relations_inds, features_inds, grouped_featsaggrsnops, grouped_featsnaggrs; fixnans=fixnans) # @logmsg LogDebug " Testing decision: $(displaydecision(decision))" @yield decision end @@ -330,7 +331,8 @@ Base.@propagate_inbounds @resumable function generate_propositional_decisions( Sf::AbstractVector{<:AbstractWorlds{W}}, features_inds::AbstractVector, grouped_featsaggrsnops::AbstractVector{<:AbstractDict{<:Aggregator,<:AbstractVector{<:ScalarMetaCondition}}}, - grouped_featsnaggrs::AbstractVector{<:AbstractVector{Tuple{<:Integer,<:Aggregator}}}, + grouped_featsnaggrs::AbstractVector{<:AbstractVector{Tuple{<:Integer,<:Aggregator}}}; + fixnans::Bool=false, ) where {W<:AbstractWorld,U,FT<:AbstractFeature,N,FR<:FullDimensionalFrame{N,W}} relation = identityrel _ninstances = length(i_instances) @@ -398,7 +400,8 @@ Base.@propagate_inbounds @resumable function generate_modal_decisions( modal_relations_inds::AbstractVector, features_inds::AbstractVector, grouped_featsaggrsnops::AbstractVector{<:AbstractDict{<:Aggregator,<:AbstractVector{<:ScalarMetaCondition}}}, - grouped_featsnaggrs::AbstractVector{<:AbstractVector{Tuple{<:Integer,<:Aggregator}}}, + grouped_featsnaggrs::AbstractVector{<:AbstractVector{Tuple{<:Integer,<:Aggregator}}}; + fixnans::Bool=false, ) where {W<:AbstractWorld,U,FT<:AbstractFeature,N,FR<:FullDimensionalFrame{N,W}} _ninstances = length(i_instances) @@ -441,6 +444,7 @@ Base.@propagate_inbounds @resumable function generate_modal_decisions( if true # _featchannel = featchannel(base(X), i_instance, i_feature) # featchannel_onestep_aggregation(X, _featchannel, i_instance, w, relation, feature(metacondition), aggregator) + # featchannel_onestep_aggregation(X, _featchannel, i_instance, w, relation, metacondition, i_metacond, i_relation; fixnans=fixnans) featchannel_onestep_aggregation(X, _featchannel, i_instance, w, relation, metacondition, 
i_metacond, i_relation) # onestep_aggregation(X, i_instance, w, relation, feature, aggregator, i_metacond, i_relation) # elseif X isa UniformFullDimensionalLogiset @@ -484,7 +488,8 @@ Base.@propagate_inbounds @resumable function generate_global_decisions( Sf::AbstractVector{<:AbstractWorlds{W}}, features_inds::AbstractVector, grouped_featsaggrsnops::AbstractVector{<:AbstractDict{<:Aggregator,<:AbstractVector{<:ScalarMetaCondition}}}, - grouped_featsnaggrs::AbstractVector{<:AbstractVector{Tuple{<:Integer,<:Aggregator}}}, + grouped_featsnaggrs::AbstractVector{<:AbstractVector{Tuple{<:Integer,<:Aggregator}}}; + fixnans::Bool=false, ) where {W<:AbstractWorld,U,FT<:AbstractFeature,N,FR<:FullDimensionalFrame{N,W}} relation = globalrel _ninstances = length(i_instances) @@ -531,7 +536,8 @@ Base.@propagate_inbounds @resumable function generate_global_decisions( gamma = begin if true # _featchannel = featchannel(base(X), i_instance, i_feature) - featchannel_onestep_aggregation(X, _featchannel, i_instance, SoleLogics.emptyworld(frame(X, i_instance)), relation, metacondition, i_metacond) + # featchannel_onestep_aggregation(X, _featchannel, i_instance, SoleLogics.emptyworld(frame(X, i_instance)), relation, metacondition, i_metacond; fixnans=fixnans) + featchannel_onestep_aggregation(X, _featchannel, i_instance, SoleLogics.emptyworld(frame(X, i_instance)), relation, metacondition, i_metacond) # onestep_aggregation(X, i_instance, dummyworldTODO, relation, feature, aggregator, i_metacond) # elseif X isa UniformFullDimensionalLogiset # onestep_aggregation(X, i_instance, dummyworldTODO, relation, feature, aggregator, i_metacond)