From 485fbc2266b6ef8d0fbabb6a07908f83d189d60d Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Sat, 1 Aug 2020 18:07:31 +0200 Subject: [PATCH] fix: converting to dataframe with out of bounds timestamps --- google/cloud/bigquery/table.py | 28 +++++++++++++++++++++++++++- setup.py | 4 +++- tests/unit/test_table.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 10b4198d32..e4d861f977 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -21,6 +21,7 @@ import functools import logging import operator +import pytz import warnings import six @@ -1726,7 +1727,32 @@ def to_dataframe( bqstorage_client=bqstorage_client, create_bqstorage_client=create_bqstorage_client, ) - df = record_batch.to_pandas(date_as_object=date_as_object) + + # When converting timestamp values to nanosecond precision, the result + # can be out of pyarrow bounds. To avoid the error when converting to + # Pandas, we set the timestamp_as_object parameter to True, if necessary. + # + # NOTE: Python 3+ only, as timestamp_as_object parameter is only supported + # in pyarrow>=1.0, but the latter is not compatible with Python 2. 
+ if six.PY2: + extra_kwargs = {} + else: + type_to_check = pyarrow.timestamp("us", tz=pytz.UTC) + + for column in record_batch: + if column.type == type_to_check: + try: + column.cast("timestamp[ns]") + except pyarrow.lib.ArrowInvalid: + timestamp_as_object = True + break + else: + timestamp_as_object = False + + extra_kwargs = {"timestamp_as_object": timestamp_as_object} + + df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs) + for column in dtypes: df[column] = pandas.Series(df[column], dtype=dtypes[column]) return df diff --git a/setup.py b/setup.py index f391143d34..25ffa7fc95 100644 --- a/setup.py +++ b/setup.py @@ -48,9 +48,11 @@ "pandas": ["pandas>=0.17.1"], # Exclude PyArrow dependency from Windows Python 2.7. 'pyarrow: platform_system != "Windows" or python_version >= "3.4"': [ + # Pyarrow >= 1.0 is not compatible with Python 2 anymore. + "pyarrow>=1.0.0, <2.0dev; python_version>='3.4'", # Bad Linux release for 0.14.0. # https://issues.apache.org/jira/browse/ARROW-5868 - "pyarrow>=0.4.1, != 0.14.0" + "pyarrow>=0.4.1, != 0.14.0; python_version<'3.0'", ], "tqdm": ["tqdm >= 4.0.0, <5.0.0dev"], "fastparquet": [ diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 28575bd430..1b6a7c8e59 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime as dt import itertools import logging import time @@ -2271,6 +2272,37 @@ def test_to_dataframe(self): self.assertEqual(df.name.dtype.name, "object") self.assertEqual(df.age.dtype.name, "int64") + @pytest.mark.xfail( + six.PY2, + reason=( + "Requires pyarrow>=1.0 to work, but the latter is not compatible " + "with Python 2 anymore." 
+ ), + ) + @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self): + from google.cloud.bigquery.schema import SchemaField + + schema = [SchemaField("some_timestamp", "TIMESTAMP")] + rows = [ + {"f": [{"v": "81953424000.0"}]}, # 4567-01-01 00:00:00 UTC + {"f": [{"v": "253402214400.0"}]}, # 9999-12-31 00:00:00 UTC + ] + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = self._make_one(_mock_client(), api_request, path, schema) + + df = row_iterator.to_dataframe(create_bqstorage_client=False) + + self.assertIsInstance(df, pandas.DataFrame) + self.assertEqual(len(df), 2) # verify the number of rows + self.assertEqual(list(df.columns), ["some_timestamp"]) + self.assertEqual( + list(df["some_timestamp"]), + [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)], + ) + @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_warning_wo_pyarrow(self): from google.cloud.bigquery.client import PyarrowMissingWarning