Commit 6e9882d: fetch from master and merge

suyashgupta01 committed Aug 30, 2021
2 parents 101c0c2 + 40faba2

Showing 70 changed files with 1,023 additions and 263 deletions.
10 changes: 9 additions & 1 deletion asv_bench/benchmarks/inference.py
@@ -115,19 +115,27 @@ def time_maybe_convert_objects(self):
class ToDatetimeFromIntsFloats:
    def setup(self):
        self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64")
+        self.ts_sec_uint = Series(range(1521080307, 1521685107), dtype="uint64")
        self.ts_sec_float = self.ts_sec.astype("float64")

        self.ts_nanosec = 1_000_000 * self.ts_sec
+        self.ts_nanosec_uint = 1_000_000 * self.ts_sec_uint
        self.ts_nanosec_float = self.ts_nanosec.astype("float64")

-    # speed of int64 and float64 paths should be comparable
+    # speed of int64, uint64 and float64 paths should be comparable

    def time_nanosec_int64(self):
        to_datetime(self.ts_nanosec, unit="ns")

+    def time_nanosec_uint64(self):
+        to_datetime(self.ts_nanosec_uint, unit="ns")
+
    def time_nanosec_float64(self):
        to_datetime(self.ts_nanosec_float, unit="ns")

+    def time_sec_uint64(self):
+        to_datetime(self.ts_sec_uint, unit="s")
+
    def time_sec_int64(self):
        to_datetime(self.ts_sec, unit="s")

4 changes: 2 additions & 2 deletions asv_bench/benchmarks/io/csv.py
@@ -206,7 +206,7 @@ def time_read_csv(self, bad_date_value):
class ReadCSVSkipRows(BaseIO):

    fname = "__test__.csv"
-    params = ([None, 10000], ["c", "python"])
+    params = ([None, 10000], ["c", "python", "pyarrow"])
    param_names = ["skiprows", "engine"]

    def setup(self, skiprows, engine):
@@ -320,7 +320,7 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):


class ReadCSVEngine(StringIORewind):
-    params = ["c", "python"]
+    params = ["c", "python", "pyarrow"]
    param_names = ["engine"]

    def setup(self, engine):
1 change: 0 additions & 1 deletion ci/deps/actions-39.yaml
@@ -31,7 +31,6 @@ dependencies:
  - python-dateutil
  - pytz
  - s3fs>=0.4.2
-  - aiobotocore<=1.3.3
  - scipy
  - sqlalchemy
  - xlrd
1 change: 0 additions & 1 deletion ci/deps/azure-windows-38.yaml
@@ -30,7 +30,6 @@ dependencies:
  - python-dateutil
  - pytz
  - s3fs>=0.4.0
-  - aiobotocore<=1.3.3
  - scipy
  - xlrd
  - xlsxwriter
1 change: 0 additions & 1 deletion ci/deps/azure-windows-39.yaml
@@ -32,7 +32,6 @@ dependencies:
  - python-dateutil
  - pytz
  - s3fs>=0.4.2
-  - aiobotocore<=1.3.3
  - scipy
  - sqlalchemy
  - xlrd
3 changes: 3 additions & 0 deletions doc/source/getting_started/install.rst
@@ -132,6 +132,9 @@ Installing from PyPI
pandas can be installed via pip from
`PyPI <https://pypi.org/project/pandas>`__.

+.. note::
+    You must have ``pip>=19.3`` to install from PyPI.
+
::

    pip install pandas
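
A quick sketch of satisfying that pip requirement before installing (standard pip invocations, shown only as an example):

::

    python -m pip install --upgrade pip
    python -m pip install pandas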
54 changes: 46 additions & 8 deletions doc/source/user_guide/io.rst
@@ -160,9 +160,15 @@ dtype : Type name or dict of column -> type, default ``None``
    (unsupported with ``engine='python'``). Use ``str`` or ``object`` together
    with suitable ``na_values`` settings to preserve and
    not interpret dtype.
-engine : {``'c'``, ``'python'``}
-    Parser engine to use. The C engine is faster while the Python engine is
-    currently more feature-complete.
+engine : {``'c'``, ``'python'``, ``'pyarrow'``}
+    Parser engine to use. The C and pyarrow engines are faster, while the python engine
+    is currently more feature-complete. Multithreading is currently only supported by
+    the pyarrow engine.
+
+    .. versionadded:: 1.4.0
+
+        The "pyarrow" engine was added as an *experimental* engine, and some features
+        are unsupported, or may not work correctly, with this engine.
converters : dict, default ``None``
    Dict of functions for converting values in certain columns. Keys can either be
    integers or column labels.
@@ -1622,11 +1628,17 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object:
Specifying the parser engine
''''''''''''''''''''''''''''

-Under the hood pandas uses a fast and efficient parser implemented in C as well
-as a Python implementation which is currently more feature-complete. Where
-possible pandas uses the C parser (specified as ``engine='c'``), but may fall
-back to Python if C-unsupported options are specified. Currently, C-unsupported
-options include:
+Pandas currently supports three engines: the C engine, the python engine, and the
+experimental pyarrow engine (requires the ``pyarrow`` package). In general, the pyarrow
+engine is fastest on larger workloads and is equivalent in speed to the C engine on most
+other workloads. The python engine tends to be slower than the pyarrow and C engines on
+most workloads. However, the pyarrow engine is much less robust than the C engine, which
+in turn lacks a few features present in the Python engine.
+
+Where possible, pandas uses the C parser (specified as ``engine='c'``), but it may fall
+back to Python if C-unsupported options are specified.
+
+Currently, options unsupported by the C and pyarrow engines include:

* ``sep`` other than a single character (e.g. regex separators)
* ``skipfooter``
@@ -1635,6 +1647,32 @@ options include:
Specifying any of the above options will produce a ``ParserWarning`` unless the
python engine is selected explicitly using ``engine='python'``.

+Options unsupported by the pyarrow engine that are not covered by the list above include:
+
+* ``float_precision``
+* ``chunksize``
+* ``comment``
+* ``nrows``
+* ``thousands``
+* ``memory_map``
+* ``dialect``
+* ``warn_bad_lines``
+* ``error_bad_lines``
+* ``on_bad_lines``
+* ``delim_whitespace``
+* ``quoting``
+* ``lineterminator``
+* ``converters``
+* ``decimal``
+* ``iterator``
+* ``dayfirst``
+* ``infer_datetime_format``
+* ``verbose``
+* ``skipinitialspace``
+* ``low_memory``
+
+Specifying these options with ``engine='pyarrow'`` will raise a ``ValueError``.
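
As a minimal sketch of the engine selection described above (the file name is hypothetical, and ``engine="pyarrow"`` assumes pyarrow is installed):

::

    import pandas as pd

    # The C engine is the default where possible; "pyarrow" opts into the
    # experimental multithreaded parser.
    df = pd.read_csv("data.csv", engine="pyarrow")

    # Per the list above, combining the pyarrow engine with an unsupported
    # option raises ValueError:
    # pd.read_csv("data.csv", engine="pyarrow", chunksize=1000)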

.. _io.remote:

Reading/writing remote files
12 changes: 11 additions & 1 deletion doc/source/whatsnew/v1.3.3.rst
@@ -17,6 +17,16 @@ Fixed regressions
- Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`)
- Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`)
- Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`)
+- Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`)

+.. ---------------------------------------------------------------------------
+.. _whatsnew_133.performance:
+
+Performance improvements
+~~~~~~~~~~~~~~~~~~~~~~~~
+- Performance improvement for :meth:`DataFrame.__setitem__` when the key or value is not a :class:`DataFrame`, or key is not list-like (:issue:`43274`)
+-
+-

.. ---------------------------------------------------------------------------
@@ -25,7 +35,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
--
+- Bug in :meth:`.DataFrameGroupBy.agg` and :meth:`.DataFrameGroupBy.transform` with ``engine="numba"`` where ``index`` data was not being correctly passed into ``func`` (:issue:`43133`)
-

.. ---------------------------------------------------------------------------
12 changes: 8 additions & 4 deletions doc/source/whatsnew/v1.4.0.rst
@@ -78,10 +78,13 @@ Styler

There are also bug fixes and deprecations listed below.

-.. _whatsnew_140.enhancements.enhancement2:
+.. _whatsnew_140.enhancements.pyarrow_csv_engine:

-enhancement2
-^^^^^^^^^^^^
+Multithreaded CSV reading with a new CSV Engine based on pyarrow
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0), allowing for faster csv parsing on multicore machines
+with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)
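
A rough way to see the effect on a multicore machine (a sketch only: ``big.csv`` is a hypothetical large file, and timings vary by workload):

::

    import time

    import pandas as pd

    for engine in ("c", "python", "pyarrow"):
        start = time.perf_counter()
        pd.read_csv("big.csv", engine=engine)
        print(engine, time.perf_counter() - start)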

.. _whatsnew_140.enhancements.other:

@@ -242,7 +245,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`)
- Performance improvement in some :meth:`GroupBy.apply` operations (:issue:`42992`)
- Performance improvement in :func:`read_stata` (:issue:`43059`)
--
+- Performance improvement in :meth:`to_datetime` with ``uint`` dtypes (:issue:`42606`)

.. ---------------------------------------------------------------------------
@@ -322,6 +325,7 @@ MultiIndex
- Bug in :meth:`MultiIndex.get_loc` where the first level is a :class:`DatetimeIndex` and a string key is passed (:issue:`42465`)
- Bug in :meth:`MultiIndex.reindex` when passing a ``level`` that corresponds to an ``ExtensionDtype`` level (:issue:`42043`)
- Bug in :meth:`MultiIndex.get_loc` raising ``TypeError`` instead of ``KeyError`` on nested tuple (:issue:`42440`)
+- Bug in :meth:`MultiIndex.putmask` where the other value was also a :class:`MultiIndex` (:issue:`43212`)
-

I/O
2 changes: 1 addition & 1 deletion environment.yml
@@ -105,7 +105,7 @@ dependencies:

  - pytables>=3.6.1  # pandas.read_hdf, DataFrame.to_hdf
  - s3fs>=0.4.0  # file IO when using 's3://...' path
-  - aiobotocore<=1.3.3  # Remove when s3fs is at 2021.08.0
+  - aiobotocore
  - fsspec>=0.7.4, <2021.6.0  # for generic remote file operations
  - gcsfs>=0.6.0  # file IO when using 'gcs://...' path
  - sqlalchemy  # pandas.read_sql, DataFrame.to_sql
4 changes: 2 additions & 2 deletions pandas/_libs/tslib.pyx
@@ -248,7 +248,7 @@ def array_with_unit_to_datetime(
    # if we have nulls that are not type-compat
    # then need to iterate

-    if values.dtype.kind == "i" or values.dtype.kind == "f":
+    if values.dtype.kind in ["i", "f", "u"]:
        iresult = values.astype("i8", copy=False)
        # fill missing values by comparing to NPY_NAT
        mask = iresult == NPY_NAT
@@ -263,7 +263,7 @@
        ):
            raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")

-        if values.dtype.kind == "i":
+        if values.dtype.kind in ["i", "u"]:
            result = (iresult * m).astype("M8[ns]")

        elif values.dtype.kind == "f":
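
At the user level, this change means unsigned-integer input now takes the same fast conversion path as ``int64``. A small sketch with illustrative values:

::

    import pandas as pd

    s = pd.Series([1521080307, 1521080308], dtype="uint64")
    pd.to_datetime(s, unit="s")  # uint64 now handled like the int64 fast path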
48 changes: 27 additions & 21 deletions pandas/core/frame.py
Expand Up @@ -1653,6 +1653,8 @@ def to_numpy(
               [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
        """
        self._consolidate_inplace()
+        if dtype is not None:
+            dtype = np.dtype(dtype)
        result = self._mgr.as_array(
            transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value
        )
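
The two added lines normalize ``dtype`` with ``np.dtype`` before it reaches the block manager, so string and type aliases behave consistently. A small illustrative sketch:

::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.5, 4.5]})
    arr = df.to_numpy(dtype="float32")  # "float32" is coerced via np.dtype(...)
    assert arr.dtype == np.dtype("float32")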
@@ -3620,9 +3622,11 @@ def __setitem__(self, key, value):
            self._setitem_array(key, value)
        elif isinstance(value, DataFrame):
            self._set_item_frame_value(key, value)
-        elif is_list_like(value) and 1 < len(
-            self.columns.get_indexer_for([key])
-        ) == len(value):
+        elif (
+            is_list_like(value)
+            and not self.columns.is_unique
+            and 1 < len(self.columns.get_indexer_for([key])) == len(value)
+        ):
            # Column to set is duplicated
            self._setitem_array([key], value)
        else:
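
The added ``not self.columns.is_unique`` check lets frames with unique columns skip the potentially costly ``get_indexer_for`` lookup; the duplicated-column branch only matters in cases like this sketch (illustrative data):

::

    import pandas as pd

    df = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "a"])
    df["a"] = [5, 6]  # key matches two columns: duplicated-column branch

    df2 = pd.DataFrame({"x": [1, 2]})
    df2["x"] = [5, 6]  # unique columns: the new check short-circuits the elif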
@@ -9824,26 +9828,28 @@ def _reduce(
        assert filter_type is None or filter_type == "bool", filter_type
        out_dtype = "bool" if filter_type == "bool" else None

-        own_dtypes = [arr.dtype for arr in self._iter_column_arrays()]
+        if numeric_only is None and name in ["mean", "median"]:
+            own_dtypes = [arr.dtype for arr in self._mgr.arrays]

-        dtype_is_dt = np.array(
-            [is_datetime64_any_dtype(dtype) for dtype in own_dtypes],
-            dtype=bool,
-        )
-        if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any():
-            warnings.warn(
-                "DataFrame.mean and DataFrame.median with numeric_only=None "
-                "will include datetime64 and datetime64tz columns in a "
-                "future version.",
-                FutureWarning,
-                stacklevel=5,
-            )
-            # Non-copy equivalent to
-            #  cols = self.columns[~dtype_is_dt]
-            #  self = self[cols]
-            predicate = lambda x: not is_datetime64_any_dtype(x.dtype)
-            mgr = self._mgr._get_data_subset(predicate)
-            self = type(self)(mgr)
+            dtype_is_dt = np.array(
+                [is_datetime64_any_dtype(dtype) for dtype in own_dtypes],
+                dtype=bool,
+            )
+            if dtype_is_dt.any():
+                warnings.warn(
+                    "DataFrame.mean and DataFrame.median with numeric_only=None "
+                    "will include datetime64 and datetime64tz columns in a "
+                    "future version.",
+                    FutureWarning,
+                    stacklevel=5,
+                )
+                # Non-copy equivalent to
+                # dt64_cols = self.dtypes.apply(is_datetime64_any_dtype)
+                # cols = self.columns[~dt64_cols]
+                # self = self[cols]
+                predicate = lambda x: not is_datetime64_any_dtype(x.dtype)
+                mgr = self._mgr._get_data_subset(predicate)
+                self = type(self)(mgr)

        # TODO: Make other agg func handle axis=None properly GH#21597
        axis = self._get_axis_number(axis)
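
The restructuring computes ``own_dtypes`` only when it can matter; the user-visible behavior is unchanged: with a datetime column present, a reduction such as ``mean`` still warns and then drops that column, as in this sketch:

::

    import pandas as pd

    df = pd.DataFrame(
        {"x": [1.0, 2.0], "t": pd.to_datetime(["2021-01-01", "2021-01-02"])}
    )
    df.mean()  # FutureWarning; "t" is excluded from the result for now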