Skip to content

Commit

Permalink
Remove pyarrow as hard dep (#523)
Browse files Browse the repository at this point in the history
* Remove pyarrow as hard dep

* return something

* fixup
  • Loading branch information
yannikschaelte authored and EmadAlamoudi committed Jun 30, 2022
1 parent 5889d20 commit 6459335
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 18 deletions.
54 changes: 38 additions & 16 deletions pyabc/storage/dataframe_bytes_storage.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import csv
import logging
import warnings
from io import BytesIO, StringIO

import numpy as np
Expand All @@ -12,20 +13,21 @@
import pyarrow.parquet as parquet
except ImportError:
pyarrow = parquet = None
logger.warning(
"Can't find pyarrow, thus falling back to csv to store pandas data.",
)


class DataFrameLoadException(Exception):
    """Exception to indicate DataFrame loading failure."""


def df_to_bytes_csv(df: pd.DataFrame) -> bytes:
    """Serialize a pandas DataFrame as csv-encoded bytes.

    Non-numeric fields are quoted so they round-trip through the
    csv reader unambiguously.
    """
    csv_text = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
    return csv_text.encode()


def df_from_bytes_csv(bytes_: bytes) -> pd.DataFrame:
"""Pandas DataFrame from csv."""
try:
s = StringIO(bytes_.decode())
s.seek(0)
Expand All @@ -37,33 +39,38 @@ def df_from_bytes_csv(bytes_: bytes) -> pd.DataFrame:
quotechar='"',
)
except UnicodeDecodeError:
raise DataFrameLoadException("Not a DataFrame")
raise DataFrameLoadException("Not a csv DataFrame")


def df_to_bytes_msgpack(df: pd.DataFrame) -> bytes:
    """Pandas DataFrame to msgpack.

    NOTE(review): msgpack support became deprecated in pandas 0.25.0
    (see the parquet conversion docstring in this module), so this
    legacy path presumably requires an older pandas — confirm before
    relying on it.
    """
    return df.to_msgpack()


def df_from_bytes_msgpack(bytes_: bytes) -> pd.DataFrame:
    """Pandas DataFrame from msgpack (legacy format).

    Raises
    ------
    DataFrameLoadException
        If the bytes cannot be decoded as a msgpack DataFrame, or the
        decoded object is not a DataFrame.
    """
    # Diff rendering had left both the old and new raise lines in place,
    # making the second of each pair unreachable; keep the newer message.
    try:
        df = pd.read_msgpack(BytesIO(bytes_))
    except UnicodeDecodeError:
        raise DataFrameLoadException("Not a msgpack DataFrame")
    if not isinstance(df, pd.DataFrame):
        raise DataFrameLoadException("Not a msgpack DataFrame")
    return df


def df_to_bytes_json(df: pd.DataFrame) -> bytes:
    """Serialize a pandas DataFrame as json-encoded bytes."""
    json_text = df.to_json()
    return json_text.encode()


def df_from_bytes_json(bytes_: bytes) -> pd.DataFrame:
    """Deserialize a pandas DataFrame from json-encoded bytes."""
    json_text = bytes_.decode()
    return pd.read_json(json_text)


def df_to_bytes_parquet(df: pd.DataFrame) -> bytes:
"""
"""Pandas DataFrame to parquet.
pyarrow parquet is the standard conversion method of pandas
DataFrames since pyabc 0.9.14, because msgpack became
deprecated in pandas 0.25.0.
Expand All @@ -76,13 +83,15 @@ def df_to_bytes_parquet(df: pd.DataFrame) -> bytes:


def df_from_bytes_parquet(bytes_: bytes) -> pd.DataFrame:
"""
"""Pandas DataFrame from parquet, also try legacy msgpack.
Since pyabc 0.9.14, pandas DataFrames are converted using
pyarrow parquet. If the conversion to DataFrame fails,
then `df_from_bytes_msgpack_` is tried, which was the formerly
used method. This is in particular useful for databases that
still employ the old format. In case errors occur here, it may
be necessary to use a pandas version prior to 0.25.0.
pyarrow parquet.
If the conversion to DataFrame fails, then `df_from_bytes_msgpack` is
tried, the formerly used method.
This is needed for old databases.
In case errors occur here, it may be necessary to use a pandas version
prior to 0.25.0.
"""
try:
b = BytesIO(bytes_)
Expand All @@ -94,6 +103,7 @@ def df_from_bytes_parquet(bytes_: bytes) -> pd.DataFrame:


def df_to_bytes_np_records(df: pd.DataFrame) -> bytes:
"""Pandas DataFrame to numpy.recarray."""
b = BytesIO()
rec = df.to_records()
np.save(b, rec, allow_pickle=False)
Expand All @@ -102,18 +112,24 @@ def df_to_bytes_np_records(df: pd.DataFrame) -> bytes:


def df_from_bytes_np_records(bytes_: bytes) -> pd.DataFrame:
    """Reconstruct a pandas DataFrame from numpy.recarray bytes.

    The "index" field written by `DataFrame.to_records` is restored
    as the DataFrame index.
    """
    buffer = BytesIO(bytes_)
    records = np.load(buffer)
    return pd.DataFrame.from_records(records, index="index")


def df_to_bytes(df: pd.DataFrame) -> bytes:
    """Write Pandas DataFrame to bytes.

    Use pyarrow PARQUET if available, otherwise csv.

    Diff rendering had fused the old and new docstring lines together;
    only the post-change wording is kept here.
    """
    if pyarrow is None:
        # pyarrow is an optional dependency; warn once per call that the
        # slower csv fallback is being used.
        warnings.warn(
            "Can't find pyarrow, falling back to less efficient csv "
            "to store pandas DataFrames.\n"
            "Install e.g. via `pip install pyabc[pyarrow]`",
        )
        return df_to_bytes_csv(df)
    return df_to_bytes_parquet(df)

Expand All @@ -124,5 +140,11 @@ def df_from_bytes(bytes_: bytes) -> pd.DataFrame:
If pyarrow is not available, try csv.
"""
if pyarrow is None:
return df_from_bytes_csv(bytes_)
try:
return df_from_bytes_csv(bytes_)
except DataFrameLoadException:
raise DataFrameLoadException(
"Not a csv DataFrame. An installation of pyarrow "
"may be required, e.g. via `pip install pyabc[pyarrow]`"
)
return df_from_bytes_parquet(bytes_)
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ install_requires =
distributed >= 2021.10.0
matplotlib >= 3.3.0
sqlalchemy >= 1.3.18
pyarrow >= 1.0.0
jabbar >= 0.0.10
gitpython >= 3.1.7

Expand All @@ -79,6 +78,8 @@ webserver =
flask_bootstrap >= 3.3.7.1
flask >= 1.1.2
bokeh >= 2.1.1
pyarrow =
pyarrow >= 6.0.0
R =
rpy2 >= 3.4.4
cffi >= 1.14.5
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ description =
# Unit tests

[testenv:base]
extras = test,R
extras = test,R,pyarrow
passenv = HOME
commands =
# needed by pot
Expand Down

0 comments on commit 6459335

Please sign in to comment.