From f2a7cd9120a3b5b5c29ad33c7b4c218faa929ca9 Mon Sep 17 00:00:00 2001 From: Chloe He Date: Mon, 6 May 2024 06:48:00 -0700 Subject: [PATCH] fix: replace NaNs with None in some backends when loading from pandas dataframe (#9094) ## Description of changes Examples were broken on MySQL backend and Postgres backend when there are null values in a numeric column. Druid, PySpark, RW don't support examples. - Exasol - did not test - Flink - broken - Impala - did not test - MSSQL - broken #9095 - MySQL - fixed - Oracle - did not test - PostgreSQL - fixed - Snowflake - did not test ## Issues closed #8792 --------- Co-authored-by: Chloe He --- ibis/backends/mysql/__init__.py | 4 ++++ ibis/backends/postgres/__init__.py | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/ibis/backends/mysql/__init__.py b/ibis/backends/mysql/__init__.py index f09768054d19..ce0589fa2843 100644 --- a/ibis/backends/mysql/__init__.py +++ b/ibis/backends/mysql/__init__.py @@ -11,6 +11,7 @@ from typing import TYPE_CHECKING, Any from urllib.parse import parse_qs, urlparse +import numpy as np import pymysql import sqlglot as sg import sqlglot.expressions as sge @@ -481,6 +482,9 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: columns = schema.keys() df = op.data.to_frame() + # nan can not be used with MySQL + df = df.replace(np.nan, None) + data = df.itertuples(index=False) cols = ", ".join( ident.sql(self.name) diff --git a/ibis/backends/postgres/__init__.py b/ibis/backends/postgres/__init__.py index e7f54864fb48..016b548d4e30 100644 --- a/ibis/backends/postgres/__init__.py +++ b/ibis/backends/postgres/__init__.py @@ -11,8 +11,11 @@ from typing import TYPE_CHECKING, Any, Callable from urllib.parse import parse_qs, urlparse +import numpy as np +import pandas as pd import sqlglot as sg import sqlglot.expressions as sge +from pandas.api.types import is_float_dtype import ibis import ibis.common.exceptions as com @@ -144,6 +147,16 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: columns = schema.keys() df = op.data.to_frame() + # nan gets compiled into 'NaN'::float which throws errors in non-float columns + # In order to hold NaN values, pandas automatically converts integer columns + # to float columns if there are NaN values in them. Therefore, we need to convert + # them to their original dtypes (that support pd.NA) to figure out which columns + # are actually non-float, then fill the NaN values in those columns with None. + convert_df = df.convert_dtypes() + for col in convert_df.columns: + if not is_float_dtype(convert_df[col]): + df[col] = df[col].replace(np.nan, None) + data = df.itertuples(index=False) cols = ", ".join( ident.sql(self.dialect)