Skip to content

Commit

Permalink
Fixed handling of invalid number values to treat as missing values (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
tgaddair authored Jul 9, 2022
1 parent 2961eb9 commit ed7967f
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 0 deletions.
4 changes: 4 additions & 0 deletions ludwig/data/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1288,6 +1288,10 @@ def build_data(
proc_cols = {}
for feature_config in feature_configs:
preprocessing_parameters = training_set_metadata[feature_config[NAME]][PREPROCESSING]

# Need to run this again here as cast_columns may have introduced new missing values
handle_missing_values(input_cols, feature_config, preprocessing_parameters)

get_from_registry(feature_config[TYPE], base_type_registry).add_feature_data(
feature_config,
input_cols,
Expand Down
35 changes: 35 additions & 0 deletions tests/integration_tests/test_preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import os
import random
import string

import numpy as np
import pandas as pd
import pytest

from ludwig.api import LudwigModel
from ludwig.constants import COLUMN, PROC_COLUMN
from ludwig.data.concatenate_datasets import concatenate_df
from tests.integration_tests.utils import (
audio_feature,
binary_feature,
Expand All @@ -14,6 +17,7 @@
image_feature,
init_backend,
LocalTestBackend,
number_feature,
sequence_feature,
)

Expand Down Expand Up @@ -139,3 +143,34 @@ def test_dask_known_divisions(feature_fn, csv_filename, tmpdir):
data_df,
skip_save_processed_input=False,
)


def test_number_feature_wrong_dtype(csv_filename, tmpdir):
    """Check that a number feature whose values are all strings still preprocesses.

    A synthetic dataset is generated, the number column is overwritten with
    random lowercase strings, and the frame is run through
    ``LudwigModel.preprocess``. The test then asserts that no rows were
    dropped and that the processed number column equals 0.0 everywhere.
    """
    number_input = number_feature()
    inputs = [number_input]
    outputs = [binary_feature()]
    model_config = {"input_features": inputs, "output_features": outputs}

    # Build the synthetic CSV and load it back as a DataFrame.
    csv_path = os.path.join(tmpdir, csv_filename)
    generated_csv_path = generate_data(inputs, outputs, csv_path)
    df = pd.read_csv(generated_csv_path)

    def _as_random_word(_value):
        # The incoming value is ignored; each cell gets 10 random lowercase letters.
        alphabet = string.ascii_lowercase
        return "".join(random.choice(alphabet) for _ in range(10))

    # Overwrite every number with a string that cannot be parsed as a number.
    raw_column = num_col = number_input[COLUMN]
    df[num_col] = df[raw_column].apply(_as_random_word)

    # Preprocess with a local test backend.
    backend = LocalTestBackend()
    model = LudwigModel(model_config, backend=backend)
    train_ds, val_ds, test_ds, _ = model.preprocess(dataset=df)

    # Stitch the three splits back together so all rows can be checked at once.
    split_frames = [split.to_df() for split in (train_ds, val_ds, test_ds)]
    concatenated_df = concatenate_df(*split_frames, backend)

    # No rows may be lost, and the processed number column is expected to be 0.0 throughout.
    assert len(concatenated_df) == len(df)
    processed_values = concatenated_df[number_input[PROC_COLUMN]]
    assert np.all(processed_values == 0.0)

0 comments on commit ed7967f

Please sign in to comment.