fix: replace NaNs with None in some backends when loading from pandas…

… dataframe (#9094)

## Description of changes

Examples were broken on the MySQL and Postgres backends when a numeric column contained null values. Druid, PySpark, and RW don't support examples.

- Exasol - did not test
- Flink - broken
- Impala - did not test
- MSSQL - broken #9095
- MySQL - fixed
- Oracle - did not test
- PostgreSQL - fixed
- Snowflake - did not test

## Issues closed

#8792

---------

Co-authored-by: Chloe He <[email protected]>
ibis-project · May 6, 2024 · f2a7cd9 · f2a7cd9
1 parent b52a77d
commit f2a7cd9
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 0 deletions.
diff --git a/ibis/backends/mysql/__init__.py b/ibis/backends/mysql/__init__.py
@@ -11,6 +11,7 @@
 from typing import TYPE_CHECKING, Any
 from urllib.parse import parse_qs, urlparse
 
+import numpy as np
 import pymysql
 import sqlglot as sg
 import sqlglot.expressions as sge
@@ -481,6 +482,9 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
 
             columns = schema.keys()
             df = op.data.to_frame()
+            # nan can not be used with MySQL
+            df = df.replace(np.nan, None)
+
             data = df.itertuples(index=False)
             cols = ", ".join(
                 ident.sql(self.name)

diff --git a/ibis/backends/postgres/__init__.py b/ibis/backends/postgres/__init__.py
@@ -11,8 +11,11 @@
 from typing import TYPE_CHECKING, Any, Callable
 from urllib.parse import parse_qs, urlparse
 
+import numpy as np
+import pandas as pd
 import sqlglot as sg
 import sqlglot.expressions as sge
+from pandas.api.types import is_float_dtype
 
 import ibis
 import ibis.common.exceptions as com
@@ -144,6 +147,16 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
 
             columns = schema.keys()
             df = op.data.to_frame()
+            # nan gets compiled into 'NaN'::float which throws errors in non-float columns
+            # In order to hold NaN values, pandas automatically converts integer columns
+            # to float columns if there are NaN values in them. Therefore, we need to convert
+            # them to their original dtypes (that support pd.NA) to figure out which columns
+            # are actually non-float, then fill the NaN values in those columns with None.
+            convert_df = df.convert_dtypes()
+            for col in convert_df.columns:
+                if not is_float_dtype(convert_df[col]):
+                    df[col] = df[col].replace(np.nan, None)
+
             data = df.itertuples(index=False)
             cols = ", ".join(
                 ident.sql(self.dialect)