From ae640222fb3ab1d1baf790a8e38db67ac4d65721 Mon Sep 17 00:00:00 2001
From: Takuya UESHIN
Date: Wed, 18 Sep 2019 17:05:29 -0700
Subject: [PATCH 1/2] Handle TimestampType separately.

---
 databricks/koalas/base.py     | 10 +++-------
 databricks/koalas/internal.py |  4 ++--
 databricks/koalas/typedef.py  |  8 ++++++++
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py
index 2b98e74dce..dd5cc33e92 100644
--- a/databricks/koalas/base.py
+++ b/databricks/koalas/base.py
@@ -26,12 +26,11 @@
 from pandas.api.types import is_list_like
 from pyspark import sql as spark
 from pyspark.sql import functions as F, Window
-from pyspark.sql.types import DoubleType, FloatType, LongType, StringType, TimestampType, \
-    to_arrow_type
+from pyspark.sql.types import DoubleType, FloatType, LongType, StringType, TimestampType
 
 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.
 from databricks.koalas.internal import _InternalFrame
-from databricks.koalas.typedef import pandas_wraps
+from databricks.koalas.typedef import pandas_wraps, spark_type_to_pandas_dtype
 from databricks.koalas.utils import align_diff_series, scol_for
 
 
@@ -219,10 +218,7 @@ def dtype(self):
         >>> s.rename("a").to_frame().set_index("a").index.dtype
         dtype('<M8[ns]')
         """
-        if isinstance(self.spark_type, TimestampType):
-            return np.dtype('datetime64[ns]')
-        else:
-            return np.dtype(to_arrow_type(self.spark_type).to_pandas_dtype())
+        return spark_type_to_pandas_dtype(self.spark_type)
diff --git a/databricks/koalas/internal.py b/databricks/koalas/internal.py
--- a/databricks/koalas/internal.py
+++ b/databricks/koalas/internal.py
         if len(pdf) == 0 and len(sdf.schema) > 0:
-            pdf = pdf.astype({field.name: to_arrow_type(field.dataType).to_pandas_dtype()
+            pdf = pdf.astype({field.name: spark_type_to_pandas_dtype(field.dataType)
                               for field in sdf.schema})
 
         index_columns = self.index_columns
diff --git a/databricks/koalas/typedef.py b/databricks/koalas/typedef.py
index f177de3870..282e18ec97 100644
--- a/databricks/koalas/typedef.py
+++ b/databricks/koalas/typedef.py
@@ -145,6 +145,14 @@ def as_spark_type(tpe) -> types.DataType:
     return _known_types.get(tpe, None)
 
 
+def spark_type_to_pandas_dtype(spark_type):
+    """ Return the pandas dtype corresponding to the given Spark DataType. """
+    if isinstance(spark_type, types.TimestampType):
+        return np.dtype('datetime64[ns]')
+    else:
+        return np.dtype(types.to_arrow_type(spark_type).to_pandas_dtype())
+
+
 def as_python_type(spark_tpe):
     return _py_conversions.get(spark_tpe, None)
 

From 39117cf96328b49b81d3f271d1ed5e4c1bbf501a Mon Sep 17 00:00:00 2001
From: Takuya UESHIN
Date: Wed, 18 Sep 2019 17:30:08 -0700
Subject: [PATCH 2/2] Add a test.

---
 databricks/koalas/tests/test_dataframe.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
index 2979602977..3b8d605c94 100644
--- a/databricks/koalas/tests/test_dataframe.py
+++ b/databricks/koalas/tests/test_dataframe.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 #
 
+from datetime import date, datetime
 import inspect
 
 import numpy as np
@@ -1642,3 +1643,11 @@ def test_transform(self):
 
         with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
             kdf.transform(1)
+
+    def test_empty_timestamp(self):
+        pdf = pd.DataFrame({'t': [datetime(2019, 1, 1, 0, 0, 0),
+                                  datetime(2019, 1, 2, 0, 0, 0),
+                                  datetime(2019, 1, 3, 0, 0, 0)]})
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(kdf[kdf['t'] != kdf['t']], pdf[pdf['t'] != pdf['t']])
+        self.assert_eq(kdf[kdf['t'] != kdf['t']].dtypes, pdf[pdf['t'] != pdf['t']].dtypes)
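
A minimal usage sketch of the helper introduced in typedef.py, assuming a Python environment with pyspark, pyarrow, and numpy installed and the patched databricks.koalas importable; the expected dtype values follow directly from the helper's two branches. TimestampType is mapped straight to pandas' nanosecond datetime64[ns] (Spark timestamps are microsecond precision, so the Arrow round trip is presumably what produced the wrong dtype for the empty-DataFrame path in internal.py), while all other types still go through to_arrow_type(...).to_pandas_dtype().

    import numpy as np
    from pyspark.sql.types import LongType, StringType, TimestampType

    from databricks.koalas.typedef import spark_type_to_pandas_dtype

    # Timestamps bypass Arrow and map directly to pandas' datetime64[ns].
    assert spark_type_to_pandas_dtype(TimestampType()) == np.dtype('datetime64[ns]')
    # Other types are still converted via to_arrow_type(...).to_pandas_dtype().
    assert spark_type_to_pandas_dtype(LongType()) == np.dtype('int64')
    assert spark_type_to_pandas_dtype(StringType()) == np.dtype('object')

The test added in PATCH 2/2 exercises the same mapping end to end: filtering a timestamp column down to zero rows should leave the column's dtype as datetime64[ns], matching pandas.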