Skip to content

Commit

Permalink
Add handling of empty columns with np.nan values
Browse files: browse the repository at this point in the history
  • Loading branch information
Jarek-Rolski committed Mar 22, 2024
1 parent b2ab215 commit b670f8b
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 10 deletions.
4 changes: 2 additions & 2 deletions api/api/domain/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,9 @@ def is_date_type(type: str) -> bool:
def extract_athena_types(df: DataFrame) -> dict:
types = {}
for column in df.columns:
dtype = str(infer_dtype(df[column], skipna=True))
if dtype == 'empty':
if df[column].dropna().size == 0:
continue
dtype = str(infer_dtype(df[column], skipna=True))
try:
types[column] = PANDAS_TO_ATHENA_CONVERTER[dtype].value
except KeyError:
Expand Down
29 changes: 21 additions & 8 deletions api/test/api/application/services/test_dataset_validation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from typing import List

import numpy as np
import pandas as pd
import pytest

Expand Down Expand Up @@ -468,6 +469,8 @@ def test_return_error_message_when_not_correct_datatypes(self):
"col3": [1, 5, True],
"col4": [1.5, 2.5, "A"],
"col5": ["2021-01-01", "2021-05-01", 1000],
"col6": [None, None, None],
"col7": [np.nan, np.nan, np.nan]
}
)
schema = Schema(
Expand Down Expand Up @@ -503,17 +506,27 @@ def test_return_error_message_when_not_correct_datatypes(self):
data_type="date",
allow_null=False,
),
Column(
name="col6",
partition_index=None,
data_type="string",
allow_null=True,
),
Column(
name="col7",
partition_index=None,
data_type="string",
allow_null=True,
),
],
)

try:
dataset_has_correct_data_types(df, schema)
except DatasetValidationError as error:
assert error.message == [
"Column [col2] has an incorrect data type. Expected boolean, received string",
"Column [col3] has an incorrect data type. Expected int, received string",
"Column [col4] has an incorrect data type. Expected double, received string",
]
data_frame, error_list = dataset_has_correct_data_types(df, schema)
assert error_list == [
"Column [col2] has an incorrect data type. Expected boolean, received string",
"Column [col3] has an incorrect data type. Expected int, received string",
"Column [col4] has an incorrect data type. Expected bigint, received string",
]

def test_return_error_message_when_dataset_has_illegal_chars_in_partition_columns(
self,
Expand Down

0 comments on commit b670f8b

Please sign in to comment.