Skip to content

Commit

Permalink
add make_missing_mcar and basic imputations (#50)
Browse files Browse the repository at this point in the history
  • Loading branch information
khosravipasha authored Nov 7, 2020
1 parent b5e830c commit 08dd8f2
Show file tree
Hide file tree
Showing 2 changed files with 188 additions and 6 deletions.
113 changes: 107 additions & 6 deletions src/Utils/misc.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,33 @@
# Miscellaneous utilities.

export issomething, issingle, order_asc, isdisjoint,
pushrand!, get_bit, init_array,
subseteq_fast, similar!,
Var, Lit, var2lit, lit2var, variables, num_variables,
always, never, uniform, logsumexp,
noop, map_values, groupby
export issomething,
issingle,
order_asc,
isdisjoint,
pushrand!,
get_bit,
init_array,
subseteq_fast,
similar!,
Var,
Lit,
var2lit,
lit2var,
variables,
num_variables,
always,
never,
uniform,
logsumexp,
noop,
map_values,
groupby,
make_missing_mcar,
impute

using CUDA: CuArray, CuVector, CuMatrix, CUDA
using DataFrames: DataFrame, missings
using Statistics: mean, median

"Is the argument not `nothing`?"
@inline issomething(x) = x !== nothing  # identity test against the `nothing` singleton
Expand Down Expand Up @@ -150,3 +170,84 @@ function groupby(f::Function, list::Union{Vector{E},Set{E}})::Dict{Any,Vector{E}
end
groups
end


##############################
# Imputations & Missing value generation
##############################

"""
    make_missing_mcar(d::DataFrame; keep_prob::Real=0.8)

Return a copy of `d` in which entries have been made missing completely at random
(MCAR): every entry is kept independently with probability `keep_prob` and is
replaced by `missing` otherwise. The column names of `d` are carried over.

# Keywords
- `keep_prob`: probability of keeping each individual entry. `0` makes every entry
  missing and `1` keeps every entry.
"""
function make_missing_mcar(d::DataFrame; keep_prob::Real=0.8)
    observed = Matrix(d)
    m = missings(eltype(d), num_examples(d), num_features(d))
    # `rand` samples from the half-open interval [0, 1), so the comparison has to be
    # strict: with `<=` an entry could still be kept when `keep_prob == 0`.
    flag = rand(num_examples(d), num_features(d)) .< keep_prob
    m[flag] .= observed[flag]
    # Pass the names explicitly so the copy keeps the columns of `d` instead of
    # receiving auto-generated ones.
    return DataFrame(m, propertynames(d))
end


"""
Return a copy of `X` with its missing values imputed (potentially using statistics from another DataFrame).
For example, to impute using same DataFrame:
impute(X; method=:median)
If you want to use another DataFrame to provide imputation statistics:
impute(test_x, train_x; method=:mean)
Supported methods are `:median`, `:mean`, `:one`, `:zero`
"""
# Self-imputation: `X` provides both the values to fill and the statistics.
impute(X::DataFrame; method=:median) = impute(X, X; method=method)
"""
    impute(X::DataFrame, train::DataFrame; method::Symbol=:median)

Return a copy of `X` in which the missing entries of each feature (column) are
replaced by a statistic computed from the non-missing entries of the same column
of `train`.

# Keywords
- `method`: `:median` or `:mean` (statistic of the observed `train` values), or
  `:one` / `:zero` (constant fill).

# Returns
A `DataFrame` with the column names of `X`. When the data is `Bool`, the statistic
is thresholded at `0.5` and the result is rebuilt from a `BitArray`.

# Throws
- `ArgumentError` if the element types of `X` and `train` (as reported by `eltype`)
  have no common type, if that common type is neither `Bool` nor floating point, or
  if `method` is not one of the supported symbols.
"""
function impute(X::DataFrame, train::DataFrame; method::Symbol=:median)
    common = typeintersect(eltype(X), eltype(train))
    common !== Union{} || throw(ArgumentError(
        "X and train have incompatible element types: $(eltype(X)) and $(eltype(train))"))

    # An intersection with `AbstractFloat` is always `<: AbstractFloat` (even when it
    # is empty), so emptiness has to be tested explicitly against `Union{}`.
    fill_type = if typeintersect(common, Bool) == Bool
        Bool
    else
        typeintersect(common, AbstractFloat)
    end
    fill_type !== Union{} || throw(ArgumentError(
        "impute only supports Bool or floating-point data, got element type $(common)"))

    impute_function = if method == :median
        median
    elseif method == :mean
        mean
    elseif method == :one
        _ -> one(fill_type)
    elseif method == :zero
        _ -> zero(fill_type)
    else
        throw(ArgumentError("Unsupported imputation type $(method)"))
    end

    X_impute = deepcopy(X)
    for feature in 1:size(X, 2)
        mask_x = ismissing.(X[!, feature])
        # Nothing to fill in this column, so no statistic is needed (and an
        # all-missing `train` column cannot cause a spurious failure).
        any(mask_x) || continue

        observed = collect(skipmissing(train[!, feature]))
        cur_impute = impute_function(observed)

        # Bool data: a fractional statistic is thresholded at one half.
        fill_value = fill_type == Bool ? Bool(cur_impute >= 0.5) : fill_type(cur_impute)
        X_impute[mask_x, feature] .= fill_value
    end

    # For Bool return BitArray-backed columns instead, keeping the column names of `X`.
    if fill_type == Bool
        return DataFrame(BitArray(Matrix(X_impute)), propertynames(X_impute))
    end
    return X_impute
end
81 changes: 81 additions & 0 deletions test/Utils/misc_test.jl
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,84 @@ using DataFrames: DataFrame

end

@testset "Imputations and make_missing" begin

    # Continuous fixture: 7 examples, 2 features.
    float_data = [1.1 2.1;
                  3.1 4.1;
                  5.1 6.1;
                  1.0 2.0;
                  3.0 4.0;
                  2.0 1.0;
                  5.0 10.0;
                  ]
    df = DataFrame(float_data)

    # Binary fixture: 7 examples, 3 features.
    dfb = DataFrame(BitMatrix([true false true;
                               true true true;
                               false true true;
                               true false true;
                               false false false;
                               true false true;
                               false false true;
                               ]))

    # True when the frame holds no `missing` entry at all.
    has_no_missing(frame) = !any(ismissing, Matrix(frame))

    # Boundary keep probabilities: drop everything / keep everything.
    all_missing = make_missing_mcar(df; keep_prob=0.0)
    @test all(ismissing, Matrix(all_missing))

    no_missing = make_missing_mcar(df; keep_prob=1.0)
    @test has_no_missing(no_missing)

    # Imputations Test
    ms_df = make_missing_mcar(df)
    ms_dfb = make_missing_mcar(dfb)
    missing_mask = Matrix(ismissing.(ms_df[:, :]))

    # Constant-fill methods additionally pin the value written into the gaps.
    expected_fill = Dict(:one => 1.0, :zero => 0.0)

    for method in (:median, :mean, :one, :zero)
        imputed = (
            impute(ms_df; method=method),
            impute(ms_df, ms_df; method=method),
            impute(ms_df, df; method=method),
            impute(ms_dfb, dfb; method=method),
            impute(ms_dfb; method=method),
        )

        for frame in imputed
            @test has_no_missing(frame)
        end

        if haskey(expected_fill, method)
            # Third variant: `ms_df` filled using statistics from the complete `df`.
            @test all(Matrix(imputed[3])[missing_mask] .== expected_fill[method])
        end
    end

end

0 comments on commit 08dd8f2

Please sign in to comment.