fix: converting to dataframe with out of bounds timestamps
plamut committed Aug 1, 2020
1 parent 8360487 commit 485fbc2
Showing 3 changed files with 62 additions and 2 deletions.
28 changes: 27 additions & 1 deletion google/cloud/bigquery/table.py
@@ -21,6 +21,7 @@
 import functools
 import logging
 import operator
+import pytz
 import warnings
 
 import six
@@ -1726,7 +1727,32 @@ def to_dataframe(
             bqstorage_client=bqstorage_client,
             create_bqstorage_client=create_bqstorage_client,
         )
-        df = record_batch.to_pandas(date_as_object=date_as_object)
+
+        # When converting timestamp values to nanosecond precision, the result
+        # can be out of pyarrow bounds. To avoid the error when converting to
+        # Pandas, we set the timestamp_as_object parameter to True, if necessary.
+        #
+        # NOTE: Python 3+ only, as timestamp_as_object parameter is only supported
+        # in pyarrow>=1.0, but the latter is not compatible with Python 2.
+        if six.PY2:
+            extra_kwargs = {}
+        else:
+            type_to_check = pyarrow.timestamp("us", tz=pytz.UTC)
+
+            for column in record_batch:
+                if column.type == type_to_check:
+                    try:
+                        column.cast("timestamp[ns]")
+                    except pyarrow.lib.ArrowInvalid:
+                        timestamp_as_object = True
+                        break
+            else:
+                timestamp_as_object = False
+
+            extra_kwargs = {"timestamp_as_object": timestamp_as_object}
+
+        df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)
 
         for column in dtypes:
             df[column] = pandas.Series(df[column], dtype=dtypes[column])
         return df
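
For context (not part of the commit): a minimal sketch of the failure this change guards against, assuming pyarrow>=1.0 with pandas and pytz installed; the batch and column name here are hypothetical. An out-of-bounds microsecond timestamp makes the implicit cast to nanoseconds raise pyarrow.lib.ArrowInvalid, while timestamp_as_object=True keeps the values as plain Python datetimes:

import pyarrow
import pytz

# 9999-12-31 00:00:00 UTC in microseconds since the epoch -- representable
# at microsecond precision, but far beyond the range of pandas' timestamp[ns].
arr = pyarrow.array([253402214400000000], type=pyarrow.timestamp("us", tz=pytz.UTC))
batch = pyarrow.RecordBatch.from_arrays([arr], names=["some_timestamp"])

try:
    batch.to_pandas()  # implicitly casts to nanoseconds and overflows
except pyarrow.lib.ArrowInvalid as exc:
    print("conversion failed:", exc)

# With timestamp_as_object=True, the column is materialized as Python
# datetime objects, so no nanosecond cast (and no overflow) happens.
df = batch.to_pandas(timestamp_as_object=True)
print(df["some_timestamp"].dtype)  # object

This mirrors what the new branch above probes for with column.cast("timestamp[ns]") before deciding whether to pass the flag.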
4 changes: 3 additions & 1 deletion setup.py
@@ -48,9 +48,11 @@
"pandas": ["pandas>=0.17.1"],
# Exclude PyArrow dependency from Windows Python 2.7.
'pyarrow: platform_system != "Windows" or python_version >= "3.4"': [
# Pyarrow >= 1.0 is not compatible with Python 2 anymore.
"pyarrow>=1.0.0, <2.0dev; python_version>='3.4'",
# Bad Linux release for 0.14.0.
# https://issues.apache.org/jira/browse/ARROW-5868
"pyarrow>=0.4.1, != 0.14.0"
"pyarrow>=0.4.1, != 0.14.0; python_version<'3.0'",
],
"tqdm": ["tqdm >= 4.0.0, <5.0.0dev"],
"fastparquet": [
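
As an aside, the two environment markers above split the pyarrow pin by interpreter version. A quick, illustrative check with the packaging library (used here only for illustration, not by this change):

from packaging.markers import Marker

# Python 3 installs resolve to the pyarrow>=1.0 requirement...
assert Marker("python_version >= '3.4'").evaluate({"python_version": "3.8"})
# ...while Python 2 installs keep the older pyarrow>=0.4.1 pin.
assert Marker("python_version < '3.0'").evaluate({"python_version": "2.7"})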
32 changes: 32 additions & 0 deletions tests/unit/test_table.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import datetime as dt
 import itertools
 import logging
 import time
@@ -2271,6 +2272,37 @@ def test_to_dataframe(self):
         self.assertEqual(df.name.dtype.name, "object")
         self.assertEqual(df.age.dtype.name, "int64")
 
+    @pytest.mark.xfail(
+        six.PY2,
+        reason=(
+            "Requires pyarrow>=1.0 to work, but the latter is not compatible "
+            "with Python 2 anymore."
+        ),
+    )
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self):
+        from google.cloud.bigquery.schema import SchemaField
+
+        schema = [SchemaField("some_timestamp", "TIMESTAMP")]
+        rows = [
+            {"f": [{"v": "81953424000.0"}]},  # 4567-01-01 00:00:00 UTC
+            {"f": [{"v": "253402214400.0"}]},  # 9999-12-31 00:00:00 UTC
+        ]
+        path = "/foo"
+        api_request = mock.Mock(return_value={"rows": rows})
+        row_iterator = self._make_one(_mock_client(), api_request, path, schema)
+
+        df = row_iterator.to_dataframe(create_bqstorage_client=False)
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 2)  # verify the number of rows
+        self.assertEqual(list(df.columns), ["some_timestamp"])
+        self.assertEqual(
+            list(df["some_timestamp"]),
+            [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
+        )
+
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     def test_to_dataframe_warning_wo_pyarrow(self):
         from google.cloud.bigquery.client import PyarrowMissingWarning
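
The raw "v" strings in the fixture are seconds since the Unix epoch, which is how the BigQuery REST API encodes TIMESTAMP cells. A quick sketch (not part of the commit) confirming the dates in the inline comments:

import datetime as dt

epoch = dt.datetime(1970, 1, 1)
print(epoch + dt.timedelta(seconds=81953424000))   # 4567-01-01 00:00:00
print(epoch + dt.timedelta(seconds=253402214400))  # 9999-12-31 00:00:00

Both values fit in Python's datetime (which reaches year 9999) but overflow pandas' nanosecond Timestamp, whose maximum falls in April 2262.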
