From 0cd24735cbde38ae2cfca5ddbd399451baf307f4 Mon Sep 17 00:00:00 2001 From: Anjana S Date: Tue, 6 Nov 2018 03:16:10 +0530 Subject: [PATCH] Bumping up min version for pyarrow and fastparquet (#23482) * Bumping up min version for pyarrow --- ci/requirements-optional-conda.txt | 4 +- ci/requirements-optional-pip.txt | 8 +-- ci/travis-27.yaml | 2 +- doc/source/install.rst | 4 +- doc/source/whatsnew/v0.24.0.txt | 6 ++- pandas/io/parquet.py | 78 +++++------------------------- pandas/tests/io/test_parquet.py | 34 ++----------- 7 files changed, 32 insertions(+), 104 deletions(-) diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt index c9dc385b879863..8758c8154abca2 100644 --- a/ci/requirements-optional-conda.txt +++ b/ci/requirements-optional-conda.txt @@ -1,7 +1,7 @@ beautifulsoup4>=4.2.1 blosc bottleneck>=1.2.0 -fastparquet +fastparquet>=0.1.2 gcsfs html5lib ipython>=5.6.0 @@ -12,7 +12,7 @@ matplotlib>=2.0.0 nbsphinx numexpr>=2.6.1 openpyxl -pyarrow>=0.4.1 +pyarrow>=0.7.0 pymysql pytables>=3.4.2 pytest-cov diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt index 347ea0d9832b04..62f1c555d8544a 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -3,7 +3,7 @@ beautifulsoup4>=4.2.1 blosc bottleneck>=1.2.0 -fastparquet +fastparquet>=0.1.2 gcsfs html5lib ipython>=5.6.0 @@ -14,9 +14,9 @@ matplotlib>=2.0.0 nbsphinx numexpr>=2.6.1 openpyxl -pyarrow>=0.4.1 +pyarrow>=0.7.0 pymysql -tables +pytables>=3.4.2 pytest-cov pytest-xdist s3fs @@ -27,4 +27,4 @@ statsmodels xarray xlrd xlsxwriter -xlwt +xlwt \ No newline at end of file diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml index 9641a76152d7b0..28bee387a4f4a9 100644 --- a/ci/travis-27.yaml +++ b/ci/travis-27.yaml @@ -22,7 +22,7 @@ dependencies: - patsy - psycopg2 - py - - pyarrow=0.4.1 + - pyarrow=0.7.0 - PyCrypto - pymysql=0.6.3 - pytables diff --git a/doc/source/install.rst b/doc/source/install.rst index 
b32c5b1145e85e..89f7b580303f58 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -258,8 +258,8 @@ Optional Dependencies * `SciPy `__: miscellaneous statistical functions, Version 0.18.1 or higher * `xarray `__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended. * `PyTables `__: necessary for HDF5-based storage, Version 3.4.2 or higher -* `pyarrow `__ (>= 0.4.1): necessary for feather-based storage. -* `Apache Parquet `__, either `pyarrow `__ (>= 0.4.1) or `fastparquet `__ (>= 0.0.6) for parquet-based storage. The `snappy `__ and `brotli `__ are available for compression support. +* `pyarrow `__ (>= 0.7.0): necessary for feather-based storage. +* `Apache Parquet `__, either `pyarrow `__ (>= 0.7.0) or `fastparquet `__ (>= 0.1.2) for parquet-based storage. The `snappy `__ and `brotli `__ are available for compression support. * `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs `__. Some common drivers are: * `psycopg2 `__: for PostgreSQL diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 7afe24880eadc7..6ace245a4bae16 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -250,7 +250,7 @@ Backwards incompatible API changes Dependencies have increased minimum versions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We have updated our minimum supported versions of dependencies (:issue:`21242`). +We have updated our minimum supported versions of dependencies (:issue:`21242`, :issue:`18742`). 
If installed, we now require: +-----------------+-----------------+----------+ @@ -268,6 +268,10 @@ If installed, we now require: +-----------------+-----------------+----------+ | scipy | 0.18.1 | | +-----------------+-----------------+----------+ +| pyarrow | 0.7.0 | | ++-----------------+-----------------+----------+ +| fastparquet | 0.1.2 | | ++-----------------+-----------------+----------+ Additionally we no longer depend on `feather-format` for feather based storage and replaced it with references to `pyarrow` (:issue:`21639` and :issue:`23053`). diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 2c75f46385e868..160a26533fb89a 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -5,7 +5,7 @@ from pandas.compat import string_types -from pandas import DataFrame, Int64Index, RangeIndex, get_option +from pandas import DataFrame, get_option import pandas.core.common as com from pandas.io.common import get_filepath_or_buffer, is_s3_url @@ -89,29 +89,20 @@ def __init__(self): "\nor via pip\n" "pip install -U pyarrow\n" ) - if LooseVersion(pyarrow.__version__) < '0.4.1': + if LooseVersion(pyarrow.__version__) < '0.7.0': raise ImportError( - "pyarrow >= 0.4.1 is required for parquet support\n\n" + "pyarrow >= 0.7.0 is required for parquet support\n\n" "you can install via conda\n" "conda install pyarrow -c conda-forge\n" "\nor via pip\n" "pip install -U pyarrow\n" ) - self._pyarrow_lt_060 = ( - LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0')) - self._pyarrow_lt_070 = ( - LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0')) - self.api = pyarrow def write(self, df, path, compression='snappy', coerce_timestamps='ms', index=None, **kwargs): self.validate_dataframe(df) - - # Only validate the index if we're writing it. 
- if self._pyarrow_lt_070 and index is not False: - self._validate_write_lt_070(df) path, _, _, _ = get_filepath_or_buffer(path, mode='wb') if index is None: @@ -119,27 +110,17 @@ def write(self, df, path, compression='snappy', else: from_pandas_kwargs = {'preserve_index': index} - if self._pyarrow_lt_060: - table = self.api.Table.from_pandas(df, timestamps_to_ms=True, - **from_pandas_kwargs) - self.api.parquet.write_table( - table, path, compression=compression, **kwargs) - - else: - table = self.api.Table.from_pandas(df, **from_pandas_kwargs) - self.api.parquet.write_table( - table, path, compression=compression, - coerce_timestamps=coerce_timestamps, **kwargs) + table = self.api.Table.from_pandas(df, **from_pandas_kwargs) + self.api.parquet.write_table( + table, path, compression=compression, + coerce_timestamps=coerce_timestamps, **kwargs) def read(self, path, columns=None, **kwargs): path, _, _, should_close = get_filepath_or_buffer(path) - if self._pyarrow_lt_070: - result = self.api.parquet.read_pandas(path, columns=columns, - **kwargs).to_pandas() - else: - kwargs['use_pandas_metadata'] = True - result = self.api.parquet.read_table(path, columns=columns, - **kwargs).to_pandas() + + kwargs['use_pandas_metadata'] = True + result = self.api.parquet.read_table(path, columns=columns, + **kwargs).to_pandas() if should_close: try: path.close() @@ -148,39 +129,6 @@ def read(self, path, columns=None, **kwargs): return result - def _validate_write_lt_070(self, df): - # Compatibility shim for pyarrow < 0.7.0 - # TODO: Remove in pandas 0.23.0 - from pandas.core.indexes.multi import MultiIndex - if isinstance(df.index, MultiIndex): - msg = ( - "Multi-index DataFrames are only supported " - "with pyarrow >= 0.7.0" - ) - raise ValueError(msg) - # Validate index - if not isinstance(df.index, Int64Index): - msg = ( - "pyarrow < 0.7.0 does not support serializing {} for the " - "index; you can .reset_index() to make the index into " - "column(s), or install the latest 
version of pyarrow or " - "fastparquet." - ) - raise ValueError(msg.format(type(df.index))) - if not df.index.equals(RangeIndex(len(df))): - raise ValueError( - "pyarrow < 0.7.0 does not support serializing a non-default " - "index; you can .reset_index() to make the index into " - "column(s), or install the latest version of pyarrow or " - "fastparquet." - ) - if df.index.name is not None: - raise ValueError( - "pyarrow < 0.7.0 does not serialize indexes with a name; you " - "can set the index.name to None or install the latest version " - "of pyarrow or fastparquet." - ) - class FastParquetImpl(BaseImpl): @@ -197,9 +145,9 @@ def __init__(self): "\nor via pip\n" "pip install -U fastparquet" ) - if LooseVersion(fastparquet.__version__) < '0.1.0': + if LooseVersion(fastparquet.__version__) < '0.1.2': raise ImportError( - "fastparquet >= 0.1.0 is required for parquet " + "fastparquet >= 0.1.2 is required for parquet " "support\n\n" "you can install via conda\n" "conda install fastparquet -c conda-forge\n" diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4c58d8ce29d8b6..3b3e7f757bf60d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -41,22 +41,6 @@ def engine(request): @pytest.fixture def pa(): - if not _HAVE_PYARROW: - pytest.skip("pyarrow is not installed") - return 'pyarrow' - - -@pytest.fixture -def pa_lt_070(): - if not _HAVE_PYARROW: - pytest.skip("pyarrow is not installed") - if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'): - pytest.skip("pyarrow is >= 0.7.0") - return 'pyarrow' - - -@pytest.fixture -def pa_ge_070(): if not _HAVE_PYARROW: pytest.skip("pyarrow is not installed") if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'): @@ -337,9 +321,9 @@ def test_write_index(self, engine): df.index.name = 'foo' check_round_trip(df, engine) - def test_write_multiindex(self, pa_ge_070): + def test_write_multiindex(self, pa): # Not suppoprted in fastparquet as of 0.1.3 or 
older pyarrow version - engine = pa_ge_070 + engine = pa df = pd.DataFrame({'A': [1, 2, 3]}) index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]) @@ -352,8 +336,8 @@ def test_write_column_multiindex(self, engine): df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns) self.check_error_on_write(df, engine, ValueError) - def test_multiindex_with_columns(self, pa_ge_070): - engine = pa_ge_070 + def test_multiindex_with_columns(self, pa): + engine = pa dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS') df = pd.DataFrame(np.random.randn(2 * len(dates), 3), columns=list('ABC')) @@ -456,8 +440,7 @@ def test_unsupported(self, pa): # older pyarrows raise ArrowInvalid self.check_error_on_write(df, pa, Exception) - def test_categorical(self, pa_ge_070): - pa = pa_ge_070 + def test_categorical(self, pa): # supported in >= 0.7.0 df = pd.DataFrame({'a': pd.Categorical(list('abc'))}) @@ -466,13 +449,6 @@ def test_categorical(self, pa_ge_070): expected = df.assign(a=df.a.astype(object)) check_round_trip(df, pa, expected=expected) - def test_categorical_unsupported(self, pa_lt_070): - pa = pa_lt_070 - - # supported in >= 0.7.0 - df = pd.DataFrame({'a': pd.Categorical(list('abc'))}) - self.check_error_on_write(df, pa, NotImplementedError) - def test_s3_roundtrip(self, df_compat, s3_resource, pa): # GH #19134 check_round_trip(df_compat, pa,