Skip to content

Commit

Permalink
simplify test
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Jul 16, 2021
1 parent 5c9eb31 commit 34898fd
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 27 deletions.
4 changes: 2 additions & 2 deletions python/pyarrow/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ class ParquetFile:
Coalesce and issue file reads in parallel to improve performance on
high-latency filesystems (e.g. S3). If True, Arrow will use a
background I/O thread pool.
coerce_int96_timestamp_unit: str, default None.
coerce_int96_timestamp_unit : str, default None.
Cast timestamps that are stored in INT96 format to a particular
resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
and therefore INT96 timestamps will be inferred as timestamps
Expand Down Expand Up @@ -1262,7 +1262,7 @@ class ParquetDataset:
use_legacy_dataset=False. If using a filesystem layer that itself
performs readahead (e.g. fsspec's S3FS), disable readahead for best
results.
coerce_int96_timestamp_unit: str, default None.
coerce_int96_timestamp_unit : str, default None.
Cast timestamps that are stored in INT96 format to a particular
resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
and therefore INT96 timestamps will be inferred as timestamps
Expand Down
33 changes: 8 additions & 25 deletions python/pyarrow/tests/parquet/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
# specific language governing permissions and limitations
# under the License.

import sys
import datetime
import io

Expand Down Expand Up @@ -298,9 +297,6 @@ def test_coerce_int96_timestamp_unit(unit):
@pytest.mark.parametrize('pq_reader_method', ['ParquetFile', 'read_table'])
def test_coerce_int96_timestamp_overflow(pq_reader_method, tempdir):

if sys.platform in ("win32", "cygwin"):
pytest.skip("Getting datetime.strftime() error on Windows")

def get_table(pq_reader_method, filename, **kwargs):
if pq_reader_method == "ParquetFile":
return pq.ParquetFile(filename, **kwargs).read()
Expand All @@ -313,37 +309,24 @@ def get_table(pq_reader_method, filename, **kwargs):
datetime.datetime(2000, 1, 1),
datetime.datetime(3000, 1, 1)
]
oob_dts_str = [
x.strftime("%Y-%m-%s %H:%M:%S.%f")
for x in oob_dts
]
df = pd.DataFrame({"a": oob_dts})
a_df = pa.Table.from_pandas(df)
table = pa.table(df)

filename = tempdir / "test_round_trip_overflow.parquet"
pq.write_table(a_df, filename, use_deprecated_int96_timestamps=True,
pq.write_table(table, filename, use_deprecated_int96_timestamps=True,
version="1.0")

# with the default resolution of ns, we get wrong values for INT96
# that are out of bounds for nanosecond range
tab_error = get_table(pq_reader_method, filename)
df_error = tab_error.to_pandas(timestamp_as_object=True)
out_error = [
x.strftime("%Y-%m-%s %H:%M:%S.%f")
for x in df_error["a"].tolist()
]

assert out_error != oob_dts_str
assert tab_error["a"].to_pylist() != oob_dts

# avoid this overflow by specifying the resolution to use for INT96 values
tab_correct = get_table(
pq_reader_method,
filename,
coerce_int96_timestamp_unit="s"
pq_reader_method, filename, coerce_int96_timestamp_unit="s"
)
df_correct = tab_correct.to_pandas(timestamp_as_object=True)
out_correct = [
x.strftime("%Y-%m-%s %H:%M:%S.%f")
for x in df_correct.a.tolist()
]
assert out_correct == oob_dts_str
tm.assert_frame_equal(df, df_correct)


def test_timestamp_restore_timezone():
Expand Down

0 comments on commit 34898fd

Please sign in to comment.