Add handling for empty columns that contain only None or np.nan values

co-cddo · Mar 22, 2024 · b670f8b · b670f8b
1 parent b2ab215
commit b670f8b
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 10 deletions.
diff --git a/api/api/domain/data_types.py b/api/api/domain/data_types.py
@@ -97,9 +97,9 @@ def is_date_type(type: str) -> bool:
 def extract_athena_types(df: DataFrame) -> dict:
     types = {}
     for column in df.columns:
-        dtype = str(infer_dtype(df[column], skipna=True))
-        if dtype == 'empty':
+        if df[column].dropna().size == 0:
             continue
+        dtype = str(infer_dtype(df[column], skipna=True))
         try:
             types[column] = PANDAS_TO_ATHENA_CONVERTER[dtype].value
         except KeyError:

diff --git a/api/test/api/application/services/test_dataset_validation.py b/api/test/api/application/services/test_dataset_validation.py
@@ -1,6 +1,7 @@
 import re
 from typing import List
 
+import numpy as np
 import pandas as pd
 import pytest
 
@@ -468,6 +469,8 @@ def test_return_error_message_when_not_correct_datatypes(self):
                 "col3": [1, 5, True],
                 "col4": [1.5, 2.5, "A"],
                 "col5": ["2021-01-01", "2021-05-01", 1000],
+                "col6": [None, None, None],
+                "col7": [np.nan, np.nan, np.nan]
             }
         )
         schema = Schema(
@@ -503,17 +506,27 @@ def test_return_error_message_when_not_correct_datatypes(self):
                     data_type="date",
                     allow_null=False,
                 ),
+                Column(
+                    name="col6",
+                    partition_index=None,
+                    data_type="string",
+                    allow_null=True,
+                ),
+                Column(
+                    name="col7",
+                    partition_index=None,
+                    data_type="string",
+                    allow_null=True,
+                ),
             ],
         )
 
-        try:
-            dataset_has_correct_data_types(df, schema)
-        except DatasetValidationError as error:
-            assert error.message == [
-                "Column [col2] has an incorrect data type. Expected boolean, received string",
-                "Column [col3] has an incorrect data type. Expected int, received string",
-                "Column [col4] has an incorrect data type. Expected double, received string",
-            ]
+        data_frame, error_list = dataset_has_correct_data_types(df, schema)
+        assert error_list == [
+            "Column [col2] has an incorrect data type. Expected boolean, received string",
+            "Column [col3] has an incorrect data type. Expected int, received string",
+            "Column [col4] has an incorrect data type. Expected bigint, received string",
+        ]
 
     def test_return_error_message_when_dataset_has_illegal_chars_in_partition_columns(
         self,