fix: re-enable to_csv and to_json related tests
chelsea-lin committed Mar 19, 2024
1 parent b519197 commit 405ca71
Showing 2 changed files with 34 additions and 28 deletions.
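
For context on the diff below: these tests export through BigQuery, and the trailing `*` in a `gs://` destination URI is expanded into zero-padded shard numbers, so the first (and, for small tables, only) output file ends in `000000000000.csv` or `000000000000.jsonl`. The re-enabled tests read that first shard back by its exact name instead of globbing with `*`, which appears to be the workaround for the gcsfs issue referenced in the new comments. A minimal sketch of the pattern, with a hypothetical bucket and file name (the real tests take `gcs_folder` from a pytest fixture):

import pandas as pd

# Hypothetical destination; the real tests get gcs_folder from a fixture.
gcs_folder = "gs://my-test-bucket/"
gcs_file_name = "test_to_csv_index_True"

# bigframes hands the wildcard URI to a BigQuery extract job, which
# replaces "*" with shard numbers: 000000000000, 000000000001, ...
# scalars_df.to_csv(f"{gcs_folder}{gcs_file_name}*.csv", index=True)

# Reading the first shard by its exact name avoids globbing through
# gcsfs (https://github.com/fsspec/gcsfs/issues/616); gcsfs must be
# installed for pandas to open gs:// paths.
gcs_df = pd.read_csv(f"{gcs_folder}{gcs_file_name}000000000000.csv")
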
tests/system/small/test_dataframe_io.py (42 changes: 22 additions, 20 deletions)
@@ -115,7 +115,6 @@ def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
     pd.testing.assert_series_equal(actual, expected)


-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 @pytest.mark.parametrize(
     ("index"),
     [True, False],
@@ -130,16 +129,13 @@ def test_to_csv_index(
     """Test the `to_csv` API with the `index` parameter."""
     scalars_df, scalars_pandas_df = scalars_dfs
     index_col = None
-    if scalars_df.index.name is not None:
-        path = gcs_folder + f"test_index_df_to_csv_index_{index}*.csv"
-        if index:
-            index_col = typing.cast(str, scalars_df.index.name)
-    else:
-        path = gcs_folder + f"test_default_index_df_to_csv_index_{index}*.csv"
+    gcs_file_name = f"test_to_csv_index_{index}"
+    if scalars_df.index.name is not None and index:
+        index_col = typing.cast(str, scalars_df.index.name)

     # TODO(swast): Support "date_format" parameter and make sure our
     # DATETIME/TIMESTAMP column export is the same format as pandas by default.
-    scalars_df.to_csv(path, index=index)
+    scalars_df.to_csv(f"{gcs_folder}{gcs_file_name}*.csv", index=index)

     # Pandas dataframes dtypes from read_csv are not fully compatible with
     # BigQuery-backed dataframes, so manually convert the dtypes specifically
@@ -149,8 +145,10 @@ def test_to_csv_index(
     dtype.pop("rowindex")
     # read_csv will decode into bytes improperly; convert_pandas_dtypes will encode properly from string
     dtype.pop("bytes_col")
+
+    # Works around the known issue: https://github.com/fsspec/gcsfs/issues/616
     gcs_df = pd.read_csv(
-        path,
+        f"{gcs_folder}{gcs_file_name}000000000000.csv",
         dtype=dtype,
         date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
         index_col=index_col,
@@ -164,7 +162,6 @@ def test_to_csv_index(
     pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df)


-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 def test_to_csv_tabs(
     scalars_dfs: Tuple[bigframes.dataframe.DataFrame, pd.DataFrame],
     gcs_folder: str,
@@ -174,11 +171,11 @@ def test_to_csv_tabs(
     """Test the `to_csv` API with the `sep` parameter."""
     scalars_df, scalars_pandas_df = scalars_dfs
     index_col = typing.cast(str, scalars_df.index.name)
-    path = gcs_folder + "test_to_csv_tabs*.csv"
+    gcs_file_name = "test_to_csv_tabs"

     # TODO(swast): Support "date_format" parameter and make sure our
     # DATETIME/TIMESTAMP column export is the same format as pandas by default.
-    scalars_df.to_csv(path, sep="\t", index=True)
+    scalars_df.to_csv(f"{gcs_folder}{gcs_file_name}*.csv", sep="\t", index=True)

     # Pandas dataframes dtypes from read_csv are not fully compatible with
     # BigQuery-backed dataframes, so manually convert the dtypes specifically
@@ -188,8 +185,10 @@ def test_to_csv_tabs(
     dtype.pop("rowindex")
     # read_csv will decode into bytes improperly; convert_pandas_dtypes will encode properly from string
     dtype.pop("bytes_col")
+
+    # Works around the known issue: https://github.com/fsspec/gcsfs/issues/616
     gcs_df = pd.read_csv(
-        path,
+        f"{gcs_folder}{gcs_file_name}000000000000.csv",
         sep="\t",
         dtype=dtype,
         date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
@@ -415,7 +414,6 @@ def test_to_json_index_invalid_lines(
     scalars_df.to_json(path, index=index)


-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 @pytest.mark.parametrize(
     ("index"),
     [True, False],
@@ -427,15 +425,19 @@ def test_to_json_index_records_orient(
 ):
     """Test the `to_json` API with the `index` parameter."""
     scalars_df, scalars_pandas_df = scalars_dfs
-    if scalars_df.index.name is not None:
-        path = gcs_folder + f"test_index_df_to_json_index_{index}*.jsonl"
-    else:
-        path = gcs_folder + f"test_default_index_df_to_json_index_{index}*.jsonl"
+    gcs_file_name = f"test_to_json_index_records_orient_{index}"

     """Test the `to_json` API with `orient` set to `records` and `lines` set to True."""
-    scalars_df.to_json(path, index=index, orient="records", lines=True)
+    scalars_df.to_json(
+        f"{gcs_folder}{gcs_file_name}*.jsonl", index=index, orient="records", lines=True
+    )

-    gcs_df = pd.read_json(path, lines=True, convert_dates=["datetime_col"])
+    # Works around the known issue: https://github.com/fsspec/gcsfs/issues/616
+    gcs_df = pd.read_json(
+        f"{gcs_folder}{gcs_file_name}000000000000.jsonl",
+        lines=True,
+        convert_dates=["datetime_col"],
+    )
     convert_pandas_dtypes(gcs_df, bytes_col=True)
     if index and scalars_df.index.name is not None:
         gcs_df = gcs_df.set_index(scalars_df.index.name)
tests/system/small/test_series.py (20 changes: 12 additions, 8 deletions)
@@ -2390,11 +2390,13 @@ def test_to_frame(scalars_dfs):
     assert_pandas_df_equal(bf_result, pd_result)


-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
-    path = gcs_folder + "test_series_to_json*.jsonl"
-    scalars_df_index["int64_col"].to_json(path, lines=True, orient="records")
-    gcs_df = pd.read_json(path, lines=True)
+    # Works around the known issue: https://github.com/fsspec/gcsfs/issues/616
+    gcs_file_name = "test_series_to_json"
+    scalars_df_index["int64_col"].to_json(
+        f"{gcs_folder}{gcs_file_name}*.jsonl", lines=True, orient="records"
+    )
+    gcs_df = pd.read_json(f"{gcs_folder}{gcs_file_name}000000000000.jsonl", lines=True)

     pd.testing.assert_series_equal(
         gcs_df["int64_col"].astype(pd.Int64Dtype()),
@@ -2404,11 +2406,13 @@ def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
     )


-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index):
-    path = gcs_folder + "test_series_to_csv*.csv"
-    scalars_df_index["int64_col"].to_csv(path)
-    gcs_df = pd.read_csv(path)
+    # Works around the known issue: https://github.com/fsspec/gcsfs/issues/616
+    gcs_file_name = "test_series_to_csv"
+    scalars_df_index["int64_col"].to_csv(
+        f"{gcs_folder}{gcs_file_name}*.csv",
+    )
+    gcs_df = pd.read_csv(f"{gcs_folder}{gcs_file_name}000000000000.csv")

     pd.testing.assert_series_equal(
         gcs_df["int64_col"].astype(pd.Int64Dtype()),
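As a local illustration of the round trip the Series tests above exercise (write JSON Lines, read back with pandas, cast to the nullable Int64 dtype before comparing), here is a self-contained sketch. The data, the local path, and the `to_frame()` call are adaptations for plain pandas; the real test calls the bigframes Series API against GCS:

import pandas as pd

# Made-up data standing in for scalars_df_index["int64_col"].
s = pd.Series([1, 2, 3], name="int64_col", dtype=pd.Int64Dtype())
path = "test_series_to_json.jsonl"  # local stand-in for the GCS shard

# One JSON object per line, as in the test's orient="records", lines=True.
s.to_frame().to_json(path, orient="records", lines=True)

# read_json yields a plain NumPy int64 column, so cast to the nullable
# Int64 dtype before asserting equality with the original Series.
gcs_df = pd.read_json(path, lines=True)
pd.testing.assert_series_equal(
    gcs_df["int64_col"].astype(pd.Int64Dtype()),
    s,
)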
