From 0cd24735cbde38ae2cfca5ddbd399451baf307f4 Mon Sep 17 00:00:00 2001
From: Anjana S <anjana695@cse.ssn.edu.in>
Date: Tue, 6 Nov 2018 03:16:10 +0530
Subject: [PATCH] Bumping up min version for pyarrow and fastparquet (#23482)

* Bumping up min version for pyarrow
---
 ci/requirements-optional-conda.txt |  4 +-
 ci/requirements-optional-pip.txt   |  8 +--
 ci/travis-27.yaml                  |  2 +-
 doc/source/install.rst             |  4 +-
 doc/source/whatsnew/v0.24.0.txt    |  6 ++-
 pandas/io/parquet.py               | 78 +++++-------------------------
 pandas/tests/io/test_parquet.py    | 34 ++-----------
 7 files changed, 32 insertions(+), 104 deletions(-)

diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt
index c9dc385b879863..8758c8154abca2 100644
--- a/ci/requirements-optional-conda.txt
+++ b/ci/requirements-optional-conda.txt
@@ -1,7 +1,7 @@
 beautifulsoup4>=4.2.1
 blosc
 bottleneck>=1.2.0
-fastparquet
+fastparquet>=0.1.2
 gcsfs
 html5lib
 ipython>=5.6.0
@@ -12,7 +12,7 @@ matplotlib>=2.0.0
 nbsphinx
 numexpr>=2.6.1
 openpyxl
-pyarrow>=0.4.1
+pyarrow>=0.7.0
 pymysql
 pytables>=3.4.2
 pytest-cov
diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt
index 347ea0d9832b04..62f1c555d8544a 100644
--- a/ci/requirements-optional-pip.txt
+++ b/ci/requirements-optional-pip.txt
@@ -3,7 +3,7 @@
 beautifulsoup4>=4.2.1
 blosc
 bottleneck>=1.2.0
-fastparquet
+fastparquet>=0.1.2
 gcsfs
 html5lib
 ipython>=5.6.0
@@ -14,9 +14,9 @@ matplotlib>=2.0.0
 nbsphinx
 numexpr>=2.6.1
 openpyxl
-pyarrow>=0.4.1
+pyarrow>=0.7.0
 pymysql
-tables
+pytables>=3.4.2
 pytest-cov
 pytest-xdist
 s3fs
@@ -27,4 +27,4 @@ statsmodels
 xarray
 xlrd
 xlsxwriter
-xlwt
+xlwt
\ No newline at end of file
diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml
index 9641a76152d7b0..28bee387a4f4a9 100644
--- a/ci/travis-27.yaml
+++ b/ci/travis-27.yaml
@@ -22,7 +22,7 @@ dependencies:
   - patsy
   - psycopg2
   - py
-  - pyarrow=0.4.1
+  - pyarrow=0.7.0
   - PyCrypto
   - pymysql=0.6.3
   - pytables
diff --git a/doc/source/install.rst b/doc/source/install.rst
index b32c5b1145e85e..89f7b580303f58 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -258,8 +258,8 @@ Optional Dependencies
 * `SciPy <http://www.scipy.org>`__: miscellaneous statistical functions, Version 0.18.1 or higher
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage, Version 3.4.2 or higher
-* `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1): necessary for feather-based storage.
-* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.org/project/python-snappy>`__ and `brotli <https://pypi.org/project/brotlipy>`__ are available for compression support.
+* `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0): necessary for feather-based storage.
+* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.1.2) for parquet-based storage. The `snappy <https://pypi.org/project/python-snappy>`__ and `brotli <https://pypi.org/project/brotlipy>`__ are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
 
     * `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 7afe24880eadc7..6ace245a4bae16 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -250,7 +250,7 @@ Backwards incompatible API changes
 Dependencies have increased minimum versions
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-We have updated our minimum supported versions of dependencies (:issue:`21242`).
+We have updated our minimum supported versions of dependencies (:issue:`21242`, `18742`).
 If installed, we now require:
 
 +-----------------+-----------------+----------+
@@ -268,6 +268,10 @@ If installed, we now require:
 +-----------------+-----------------+----------+
 | scipy           | 0.18.1          |          |
 +-----------------+-----------------+----------+
+| pyarrow         | 0.7.0           |          |
++-----------------+-----------------+----------+
+| fastparquet     | 0.1.2           |          |
++-----------------+-----------------+----------+
 
 Additionally we no longer depend on `feather-format` for feather based storage
 and replaced it with references to `pyarrow` (:issue:`21639` and :issue:`23053`).
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 2c75f46385e868..160a26533fb89a 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -5,7 +5,7 @@
 
 from pandas.compat import string_types
 
-from pandas import DataFrame, Int64Index, RangeIndex, get_option
+from pandas import DataFrame, get_option
 import pandas.core.common as com
 
 from pandas.io.common import get_filepath_or_buffer, is_s3_url
@@ -89,29 +89,20 @@ def __init__(self):
                 "\nor via pip\n"
                 "pip install -U pyarrow\n"
             )
-        if LooseVersion(pyarrow.__version__) < '0.4.1':
+        if LooseVersion(pyarrow.__version__) < '0.7.0':
             raise ImportError(
-                "pyarrow >= 0.4.1 is required for parquet support\n\n"
+                "pyarrow >= 0.7.0 is required for parquet support\n\n"
                 "you can install via conda\n"
                 "conda install pyarrow -c conda-forge\n"
                 "\nor via pip\n"
                 "pip install -U pyarrow\n"
             )
 
-        self._pyarrow_lt_060 = (
-            LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0'))
-        self._pyarrow_lt_070 = (
-            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'))
-
         self.api = pyarrow
 
     def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', index=None, **kwargs):
         self.validate_dataframe(df)
-
-        # Only validate the index if we're writing it.
-        if self._pyarrow_lt_070 and index is not False:
-            self._validate_write_lt_070(df)
         path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
 
         if index is None:
@@ -119,27 +110,17 @@ def write(self, df, path, compression='snappy',
         else:
             from_pandas_kwargs = {'preserve_index': index}
 
-        if self._pyarrow_lt_060:
-            table = self.api.Table.from_pandas(df, timestamps_to_ms=True,
-                                               **from_pandas_kwargs)
-            self.api.parquet.write_table(
-                table, path, compression=compression, **kwargs)
-
-        else:
-            table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
-            self.api.parquet.write_table(
-                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
+        self.api.parquet.write_table(
+            table, path, compression=compression,
+            coerce_timestamps=coerce_timestamps, **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         path, _, _, should_close = get_filepath_or_buffer(path)
-        if self._pyarrow_lt_070:
-            result = self.api.parquet.read_pandas(path, columns=columns,
-                                                  **kwargs).to_pandas()
-        else:
-            kwargs['use_pandas_metadata'] = True
-            result = self.api.parquet.read_table(path, columns=columns,
-                                                 **kwargs).to_pandas()
+
+        kwargs['use_pandas_metadata'] = True
+        result = self.api.parquet.read_table(path, columns=columns,
+                                             **kwargs).to_pandas()
         if should_close:
             try:
                 path.close()
@@ -148,39 +129,6 @@ def read(self, path, columns=None, **kwargs):
 
         return result
 
-    def _validate_write_lt_070(self, df):
-        # Compatibility shim for pyarrow < 0.7.0
-        # TODO: Remove in pandas 0.23.0
-        from pandas.core.indexes.multi import MultiIndex
-        if isinstance(df.index, MultiIndex):
-            msg = (
-                "Multi-index DataFrames are only supported "
-                "with pyarrow >= 0.7.0"
-            )
-            raise ValueError(msg)
-        # Validate index
-        if not isinstance(df.index, Int64Index):
-            msg = (
-                "pyarrow < 0.7.0 does not support serializing {} for the "
-                "index; you can .reset_index() to make the index into "
-                "column(s), or install the latest version of pyarrow or "
-                "fastparquet."
-            )
-            raise ValueError(msg.format(type(df.index)))
-        if not df.index.equals(RangeIndex(len(df))):
-            raise ValueError(
-                "pyarrow < 0.7.0 does not support serializing a non-default "
-                "index; you can .reset_index() to make the index into "
-                "column(s), or install the latest version of pyarrow or "
-                "fastparquet."
-            )
-        if df.index.name is not None:
-            raise ValueError(
-                "pyarrow < 0.7.0 does not serialize indexes with a name; you "
-                "can set the index.name to None or install the latest version "
-                "of pyarrow or fastparquet."
-            )
-
 
 class FastParquetImpl(BaseImpl):
 
@@ -197,9 +145,9 @@ def __init__(self):
                 "\nor via pip\n"
                 "pip install -U fastparquet"
             )
-        if LooseVersion(fastparquet.__version__) < '0.1.0':
+        if LooseVersion(fastparquet.__version__) < '0.1.2':
             raise ImportError(
-                "fastparquet >= 0.1.0 is required for parquet "
+                "fastparquet >= 0.1.2 is required for parquet "
                 "support\n\n"
                 "you can install via conda\n"
                 "conda install fastparquet -c conda-forge\n"
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 4c58d8ce29d8b6..3b3e7f757bf60d 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -41,22 +41,6 @@ def engine(request):
 
 @pytest.fixture
 def pa():
-    if not _HAVE_PYARROW:
-        pytest.skip("pyarrow is not installed")
-    return 'pyarrow'
-
-
-@pytest.fixture
-def pa_lt_070():
-    if not _HAVE_PYARROW:
-        pytest.skip("pyarrow is not installed")
-    if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'):
-        pytest.skip("pyarrow is >= 0.7.0")
-    return 'pyarrow'
-
-
-@pytest.fixture
-def pa_ge_070():
     if not _HAVE_PYARROW:
         pytest.skip("pyarrow is not installed")
     if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
@@ -337,9 +321,9 @@ def test_write_index(self, engine):
         df.index.name = 'foo'
         check_round_trip(df, engine)
 
-    def test_write_multiindex(self, pa_ge_070):
+    def test_write_multiindex(self, pa):
         # Not suppoprted in fastparquet as of 0.1.3 or older pyarrow version
-        engine = pa_ge_070
+        engine = pa
 
         df = pd.DataFrame({'A': [1, 2, 3]})
         index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
@@ -352,8 +336,8 @@ def test_write_column_multiindex(self, engine):
         df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
         self.check_error_on_write(df, engine, ValueError)
 
-    def test_multiindex_with_columns(self, pa_ge_070):
-        engine = pa_ge_070
+    def test_multiindex_with_columns(self, pa):
+        engine = pa
         dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
         df = pd.DataFrame(np.random.randn(2 * len(dates), 3),
                           columns=list('ABC'))
@@ -456,8 +440,7 @@ def test_unsupported(self, pa):
         # older pyarrows raise ArrowInvalid
         self.check_error_on_write(df, pa, Exception)
 
-    def test_categorical(self, pa_ge_070):
-        pa = pa_ge_070
+    def test_categorical(self, pa):
 
         # supported in >= 0.7.0
         df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
@@ -466,13 +449,6 @@ def test_categorical(self, pa_ge_070):
         expected = df.assign(a=df.a.astype(object))
         check_round_trip(df, pa, expected=expected)
 
-    def test_categorical_unsupported(self, pa_lt_070):
-        pa = pa_lt_070
-
-        # supported in >= 0.7.0
-        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
-        self.check_error_on_write(df, pa, NotImplementedError)
-
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         # GH #19134
         check_round_trip(df_compat, pa,