From 8a457c725012f805eb5f26b76c8018092c1ea416 Mon Sep 17 00:00:00 2001
From: aflah02 <72096386+aflah02@users.noreply.github.com>
Date: Wed, 16 Dec 2020 21:48:40 +0530
Subject: [PATCH 01/17] Updated README (#38491)

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 6d1d890c54093..f238e219bd3d8 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,7 @@ The source code is currently hosted on GitHub at:
 https://github.com/pandas-dev/pandas
 
 Binary installers for the latest released version are available at the [Python
-package index](https://pypi.org/project/pandas) and on conda.
+Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/).
 
 ```sh
 # conda
 conda install pandas
@@ -100,15 +100,15 @@ pip install pandas
 ```
 
 ## Dependencies
-- [NumPy](https://www.numpy.org)
-- [python-dateutil](https://labix.org/python-dateutil)
-- [pytz](https://pythonhosted.org/pytz)
+- [NumPy - Adds support for large, multi-dimensional arrays, matrices and high-level mathematical functions to operate on these arrays](https://www.numpy.org)
+- [python-dateutil - Provides powerful extensions to the standard datetime module](https://labix.org/python-dateutil)
+- [pytz - Brings the Olson tz database into Python which allows accurate and cross platform timezone calculations](https://pythonhosted.org/pytz)
 
 See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) for minimum supported versions of required, recommended and optional dependencies.
 
 ## Installation from sources
-To install pandas from source you need Cython in addition to the normal
-dependencies above. Cython can be installed from pypi:
+To install pandas from source you need [Cython](https://cython.org/) in addition to the normal
+dependencies above. Cython can be installed from PyPI:
 
 ```sh
 pip install cython
@@ -145,7 +145,7 @@ See the full instructions for [installing from source](https://pandas.pydata.org
 The official documentation is hosted on PyData.org: https://pandas.pydata.org/pandas-docs/stable
 
 ## Background
-Work on ``pandas`` started at AQR (a quantitative hedge fund) in 2008 and
+Work on ``pandas`` started at [AQR](https://www.aqr.com/) (a quantitative hedge fund) in 2008 and
 has been under active development since then.
 
 ## Getting Help
From d210962d0da0f2fa3d0c433122a81ca98958b90b Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 16 Dec 2020 11:26:02 -0800
Subject: [PATCH 02/17] CI: pin xlrd<2.0 (#38526)

---
 ci/deps/azure-37-slow.yaml    | 2 +-
 ci/deps/azure-38-locale.yaml  | 2 +-
 ci/deps/azure-macos-37.yaml   | 2 +-
 ci/deps/azure-windows-37.yaml | 2 +-
 ci/deps/azure-windows-38.yaml | 2 +-
 ci/deps/travis-37-cov.yaml    | 2 +-
 ci/deps/travis-37-locale.yaml | 2 +-
 ci/deps/travis-38-slow.yaml   | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/ci/deps/azure-37-slow.yaml b/ci/deps/azure-37-slow.yaml
index 50fccf86b6340..05b33fa351ac9 100644
--- a/ci/deps/azure-37-slow.yaml
+++ b/ci/deps/azure-37-slow.yaml
@@ -31,7 +31,7 @@ dependencies:
   - moto>=1.3.14
   - scipy
   - sqlalchemy
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - moto
diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml
index f879111a32e67..90cd11037e472 100644
--- a/ci/deps/azure-38-locale.yaml
+++ b/ci/deps/azure-38-locale.yaml
@@ -30,7 +30,7 @@ dependencies:
   - pytz
   - scipy
   - xarray
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - moto
diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml
index 31e0ffca81424..0b8aff83fe230 100644
--- a/ci/deps/azure-macos-37.yaml
+++ b/ci/deps/azure-macos-37.yaml
@@ -26,7 +26,7 @@ dependencies:
   - python-dateutil==2.7.3
   - pytz
   - xarray
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - pip
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
index 16b4bd72683b4..ad72b9c8577e9 100644
--- a/ci/deps/azure-windows-37.yaml
+++ b/ci/deps/azure-windows-37.yaml
@@ -33,7 +33,7 @@ dependencies:
   - s3fs>=0.4.2
   - scipy
   - sqlalchemy
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - pyreadstat
diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml
index 449bbd05991bf..08693e02aa8d3 100644
--- a/ci/deps/azure-windows-38.yaml
+++ b/ci/deps/azure-windows-38.yaml
@@ -31,6 +31,6 @@ dependencies:
   - pytz
   - s3fs>=0.4.0
   - scipy
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
diff --git a/ci/deps/travis-37-cov.yaml b/ci/deps/travis-37-cov.yaml
index c89b42ef06a2e..b68ff0672888a 100644
--- a/ci/deps/travis-37-cov.yaml
+++ b/ci/deps/travis-37-cov.yaml
@@ -43,7 +43,7 @@ dependencies:
   - sqlalchemy
   - statsmodels
   - xarray
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - pip
diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml
index 4e442b10482a7..60a92c4dfd3c6 100644
--- a/ci/deps/travis-37-locale.yaml
+++ b/ci/deps/travis-37-locale.yaml
@@ -35,7 +35,7 @@ dependencies:
   - pytables>=3.5.1
   - scipy
   - xarray=0.12.3
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - moto
diff --git a/ci/deps/travis-38-slow.yaml b/ci/deps/travis-38-slow.yaml
index e4b719006a11e..2b4339cf12658 100644
--- a/ci/deps/travis-38-slow.yaml
+++ b/ci/deps/travis-38-slow.yaml
@@ -30,7 +30,7 @@ dependencies:
   - moto>=1.3.14
   - scipy
   - sqlalchemy
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - moto

From 7043f8fa9d4d97782ec0d0d1a4c3b57573a7fc21 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 16 Dec 2020 14:32:23 -0800
Subject: [PATCH 03/17] REF: use astype_nansafe in Index.astype (#38518)

---
 pandas/core/indexes/base.py    | 20 ++++++++++----------
 pandas/core/indexes/numeric.py | 19 -------------------
 2 files changed, 10 insertions(+), 29 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index bafb37775cbb1..2101893d39dc9 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -33,6 +33,7 @@ from pandas.util._decorators import Appender, cache_readonly, doc
 
 from pandas.core.dtypes.cast import (
+    astype_nansafe,
     find_common_type,
     maybe_cast_to_integer_array,
     maybe_promote,
@@ -693,22 +694,21 @@ def astype(self, dtype, copy=True):
 
         if is_dtype_equal(self.dtype, dtype):
             return self.copy() if copy else self
-        elif is_categorical_dtype(dtype):
-            from pandas.core.indexes.category import CategoricalIndex
-
-            return CategoricalIndex(
-                self._values, name=self.name, dtype=dtype, copy=copy
+        if needs_i8_conversion(dtype) and is_float_dtype(self.dtype):
+            # We can't put this into astype_nansafe bc astype_nansafe allows
+            # casting np.nan to NaT
+            raise TypeError(
+                f"Cannot convert {type(self).__name__} to dtype {dtype}; integer "
+                "values are required for conversion"
             )
-        elif is_extension_array_dtype(dtype):
-            return Index(np.asarray(self), name=self.name, dtype=dtype, copy=copy)
 
         try:
-            casted = self._values.astype(dtype, copy=copy)
-        except (TypeError, ValueError) as err:
+            casted = astype_nansafe(self._values, dtype=dtype, copy=True)
+        except TypeError as err:
             raise TypeError(
                 f"Cannot cast {type(self).__name__} to dtype {dtype}"
             ) from err
+
         return Index(casted, name=self.name, dtype=dtype)
 
 _index_shared_docs[
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
index 91d27d9922aa5..d6f91c9a06739 100644
--- a/pandas/core/indexes/numeric.py
+++ b/pandas/core/indexes/numeric.py
@@ -7,12 +7,10 @@ from pandas._typing import Dtype, DtypeObj, Label
 from pandas.util._decorators import doc
 
-from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.dtypes.common import (
     is_bool,
     is_bool_dtype,
     is_dtype_equal,
-    is_extension_array_dtype,
     is_float,
     is_float_dtype,
     is_integer_dtype,
@@ -21,8 +19,6 @@
     is_scalar,
     is_signed_integer_dtype,
     is_unsigned_integer_dtype,
-    needs_i8_conversion,
-    pandas_dtype,
 )
 from pandas.core.dtypes.generic import ABCSeries
 from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna
@@ -332,21 +328,6 @@ def inferred_type(self) -> str:
         """
         return "floating"
 
-    @doc(Index.astype)
-    def astype(self, dtype, copy=True):
-        dtype = pandas_dtype(dtype)
-        if needs_i8_conversion(dtype):
-            raise TypeError(
-                f"Cannot convert Float64Index to dtype {dtype}; integer "
-                "values are required for conversion"
-            )
-        elif is_integer_dtype(dtype) and not is_extension_array_dtype(dtype):
-            # TODO(jreback); this can change once we have an EA Index type
-            # GH 13149
-            arr = astype_nansafe(self._values, dtype=dtype)
-            return Int64Index(arr, name=self.name)
-        return super().astype(dtype, copy=copy)
-
     # ----------------------------------------------------------------
     # Indexing Methods
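Note - the snippet below is an illustration of the behavior the patch above standardizes, not part of the patch itself. With Index.astype routed through astype_nansafe, a float-dtype Index now refuses a cast to a datetime-like dtype up front (previously only Float64Index special-cased this); the error message shown is the one added in pandas/core/indexes/base.py:

    import pandas as pd

    idx = pd.Index([1.5, 2.5])  # float64 Index

    # needs_i8_conversion(dtype) and is_float_dtype(self.dtype) -> TypeError,
    # since silently casting np.nan to NaT would be ambiguous
    try:
        idx.astype("datetime64[ns]")
    except TypeError as err:
        print(err)  # "... integer values are required for conversion"
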
From 7d8a052ee869ee547d204f53b15a5dc7c6b3f0c3 Mon Sep 17 00:00:00 2001
From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com>
Date: Wed, 16 Dec 2020 20:23:05 -0500
Subject: [PATCH 04/17] BENCH/REF: parametrize CSV benchmarks on engine
 (#38442)

---
 asv_bench/benchmarks/io/csv.py | 95 ++++++++++++++++++++++------------
 1 file changed, 63 insertions(+), 32 deletions(-)

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 9bcd125f56bbb..24d21ad6a633d 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -1,4 +1,4 @@
-from io import StringIO
+from io import BytesIO, StringIO
 import random
 import string
 
@@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
 
 class ReadCSVSkipRows(BaseIO):
     fname = "__test__.csv"
-    params = [None, 10000]
-    param_names = ["skiprows"]
+    params = ([None, 10000], ["c", "python"])
+    param_names = ["skiprows", "engine"]
 
-    def setup(self, skiprows):
+    def setup(self, skiprows, engine):
         N = 20000
         index = tm.makeStringIndex(N)
         df = DataFrame(
@@ -164,8 +164,8 @@ def setup(self, skiprows):
         )
         df.to_csv(self.fname)
 
-    def time_skipprows(self, skiprows):
-        read_csv(self.fname, skiprows=skiprows)
+    def time_skipprows(self, skiprows, engine):
+        read_csv(self.fname, skiprows=skiprows, engine=engine)
 
 
 class ReadUint64Integers(StringIORewind):
@@ -192,10 +192,10 @@ def time_read_uint64_na_values(self):
 
 class ReadCSVThousands(BaseIO):
     fname = "__test__.csv"
-    params = ([",", "|"], [None, ","])
-    param_names = ["sep", "thousands"]
+    params = ([",", "|"], [None, ","], ["c", "python"])
+    param_names = ["sep", "thousands", "engine"]
 
-    def setup(self, sep, thousands):
+    def setup(self, sep, thousands, engine):
         N = 10000
         K = 8
         data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
@@ -206,16 +206,19 @@ def setup(self, sep, thousands):
         df = df.applymap(lambda x: fmt.format(x))
         df.to_csv(self.fname, sep=sep)
 
-    def time_thousands(self, sep, thousands):
-        read_csv(self.fname, sep=sep, thousands=thousands)
+    def time_thousands(self, sep, thousands, engine):
+        read_csv(self.fname, sep=sep, thousands=thousands, engine=engine)
 
 
 class ReadCSVComment(StringIORewind):
-    def setup(self):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
        data = ["A,B,C"] + (["1,2,3 # comment"] * 100000)
         self.StringIO_input = StringIO("\n".join(data))
 
-    def time_comment(self):
+    def time_comment(self, engine):
         read_csv(
             self.data(self.StringIO_input), comment="#", header=None, names=list("abc")
         )
@@ -255,25 +258,47 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
         )
 
 
+class ReadCSVEngine(StringIORewind):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
+        data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000)
+        self.StringIO_input = StringIO("\n".join(data))
+        # simulate reading from file
+        self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8"))
+
+    def time_read_stringcsv(self, engine):
+        read_csv(self.data(self.StringIO_input), engine=engine)
+
+    def time_read_bytescsv(self, engine):
+        read_csv(self.data(self.BytesIO_input), engine=engine)
+
+
 class ReadCSVCategorical(BaseIO):
     fname = "__test__.csv"
+    params = ["c", "python"]
+    param_names = ["engine"]
 
-    def setup(self):
+    def setup(self, engine):
         N = 100000
         group1 = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"]
         df = DataFrame(np.random.choice(group1, (N, 3)), columns=list("abc"))
         df.to_csv(self.fname, index=False)
 
-    def time_convert_post(self):
-        read_csv(self.fname).apply(Categorical)
+    def time_convert_post(self, engine):
+        read_csv(self.fname, engine=engine).apply(Categorical)
 
-    def time_convert_direct(self):
-        read_csv(self.fname, dtype="category")
+    def time_convert_direct(self, engine):
+        read_csv(self.fname, engine=engine, dtype="category")
 
 
 class ReadCSVParseDates(StringIORewind):
-    def setup(self):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
         data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
         {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
         {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
@@ -284,18 +309,20 @@ def setup(self):
         data = data.format(*two_cols)
         self.StringIO_input = StringIO(data)
 
-    def time_multiple_date(self):
+    def time_multiple_date(self, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             names=list(string.digits[:9]),
             parse_dates=[[1, 2], [1, 3]],
         )
 
-    def time_baseline(self):
+    def time_baseline(self, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             parse_dates=[1],
 
 
 class ReadCSVCachedParseDates(StringIORewind):
-    params = ([True, False],)
-    param_names = ["do_cache"]
+    params = ([True, False], ["c", "python"])
+    param_names = ["do_cache", "engine"]
 
-    def setup(self, do_cache):
+    def setup(self, do_cache, engine):
         data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
         self.StringIO_input = StringIO(data)
 
-    def time_read_csv_cached(self, do_cache):
+    def time_read_csv_cached(self, do_cache, engine):
         try:
             read_csv(
                 self.data(self.StringIO_input),
+                engine=engine,
                 header=None,
                 parse_dates=[0],
                 cache_dates=do_cache,
@@ -329,37 +357,40 @@ class ReadCSVMemoryGrowth(BaseIO):
     chunksize = 20
     num_rows = 1000
     fname = "__test__.csv"
+    params = ["c", "python"]
+    param_names = ["engine"]
 
-    def setup(self):
+    def setup(self, engine):
         with open(self.fname, "w") as f:
             for i in range(self.num_rows):
                 f.write(f"{i}\n")
 
-    def mem_parser_chunks(self):
+    def mem_parser_chunks(self, engine):
         # see gh-24805.
-        result = read_csv(self.fname, chunksize=self.chunksize)
+        result = read_csv(self.fname, chunksize=self.chunksize, engine=engine)
 
         for _ in result:
             pass
 
 
 class ReadCSVParseSpecialDate(StringIORewind):
-    params = (["mY", "mdY", "hm"],)
-    param_names = ["value"]
+    params = (["mY", "mdY", "hm"], ["c", "python"])
+    param_names = ["value", "engine"]
     objects = {
         "mY": "01-2019\n10-2019\n02/2000\n",
         "mdY": "12/02/2010\n",
         "hm": "21:34\n",
     }
 
-    def setup(self, value):
+    def setup(self, value, engine):
         count_elem = 10000
         data = self.objects[value] * count_elem
         self.StringIO_input = StringIO(data)
 
-    def time_read_special_date(self, value):
+    def time_read_special_date(self, value, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             names=["Date"],
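Note - a minimal sketch (hypothetical ReadCSVToy class, not part of the patch) of the asv params/param_names pattern this patch applies throughout: asv runs setup and each time_* method once per element of the cross-product of the params lists, passing one argument per entry in param_names:

    from io import StringIO

    from pandas import read_csv

    class ReadCSVToy:
        # asv benchmarks every combination:
        # (None, "c"), (None, "python"), (10000, "c"), (10000, "python")
        params = ([None, 10000], ["c", "python"])
        param_names = ["skiprows", "engine"]

        def setup(self, skiprows, engine):
            # build the input once per combination
            self.csv = "a,b,c\n" + "1,2,3\n" * 20000

        def time_skiprows(self, skiprows, engine):
            read_csv(StringIO(self.csv), skiprows=skiprows, engine=engine)
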
From a66482e129a438f013962db2f6cd778d20be1bba Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 16 Dec 2020 17:23:32 -0800
Subject: [PATCH 05/17] CLN: remove CategoricalIndex._engine (#38529)

---
 pandas/core/indexes/category.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index e2a7752cf3f0d..7c826000d035a 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -8,7 +8,7 @@
 from pandas._libs import index as libindex
 from pandas._libs.lib import no_default
 from pandas._typing import ArrayLike, Label
-from pandas.util._decorators import Appender, cache_readonly, doc
+from pandas.util._decorators import Appender, doc
 
 from pandas.core.dtypes.common import (
     ensure_platform_int,
@@ -381,14 +381,6 @@ def fillna(self, value, downcast=None):
         cat = self._data.fillna(value)
         return type(self)._simple_new(cat, name=self.name)
 
-    @cache_readonly
-    def _engine(self):
-        # we are going to look things up with the codes themselves.
-        # To avoid a reference cycle, bind `codes` to a local variable, so
-        # `self` is not passed into the lambda.
-        codes = self.codes
-        return self._engine_type(lambda: codes, len(self))
-
     @doc(Index.unique)
     def unique(self, level=None):
         if level is not None:

From 0556613072fe44b88289d908992174a5b8509019 Mon Sep 17 00:00:00 2001
From: patrick <61934744+phofl@users.noreply.github.com>
Date: Thu, 17 Dec 2020 03:12:42 +0100
Subject: [PATCH 06/17] BUG: MultiIndex.equals incorrectly returning True when
 Indexes contain NaN (#38511)

---
 doc/source/whatsnew/v1.3.0.rst                 |  2 +-
 pandas/core/indexes/multi.py                   | 10 +++++++---
 pandas/tests/indexes/multi/test_equivalence.py | 10 ++++++++++
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 57dd1d05a274e..af96269019ca4 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -232,7 +232,7 @@ MultiIndex
 ^^^^^^^^^^
 
 - Bug in :meth:`DataFrame.drop` raising ``TypeError`` when :class:`MultiIndex` is non-unique and no level is provided (:issue:`36293`)
--
+- Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when the :class:`MultiIndex` objects contain ``NaN`` but are ordered differently (:issue:`38439`)
 
 I/O
 ^^^
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 1edd98e980a2d..78e7a8516178a 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -3454,13 +3454,17 @@ def equals(self, other: object) -> bool:
 
         for i in range(self.nlevels):
             self_codes = self.codes[i]
-            self_codes = self_codes[self_codes != -1]
+            other_codes = other.codes[i]
+            self_mask = self_codes == -1
+            other_mask = other_codes == -1
+            if not np.array_equal(self_mask, other_mask):
+                return False
+            self_codes = self_codes[~self_mask]
             self_values = algos.take_nd(
                 np.asarray(self.levels[i]._values), self_codes, allow_fill=False
             )
 
-            other_codes = other.codes[i]
-            other_codes = other_codes[other_codes != -1]
+            other_codes = other_codes[~other_mask]
             other_values = algos.take_nd(
                 np.asarray(other.levels[i]._values), other_codes, allow_fill=False
             )
diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py
index c31c2416ff722..bb34760e28d96 100644
--- a/pandas/tests/indexes/multi/test_equivalence.py
+++ b/pandas/tests/indexes/multi/test_equivalence.py
@@ -209,6 +209,16 @@ def test_equals_missing_values():
     assert not result
 
 
+def test_equals_missing_values_differently_sorted():
+    # GH#38439
+    mi1 = pd.MultiIndex.from_tuples([(81.0, np.nan), (np.nan, np.nan)])
+    mi2 = pd.MultiIndex.from_tuples([(np.nan, np.nan), (81.0, np.nan)])
+    assert not mi1.equals(mi2)
+
+    mi2 = pd.MultiIndex.from_tuples([(81.0, np.nan), (np.nan, np.nan)])
+    assert mi1.equals(mi2)
+
+
 def test_is_():
     mi = MultiIndex.from_tuples(zip(range(10), range(10)))
     assert mi.is_(mi)
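Note - an illustrative restatement of what the mask comparison above fixes (not part of the patch): the old code dropped the -1 (missing) codes from each level before comparing values, so two MultiIndexes whose NaNs sit in different rows could still compare equal:

    import numpy as np
    import pandas as pd

    mi1 = pd.MultiIndex.from_tuples([(81.0, np.nan), (np.nan, np.nan)])
    mi2 = pd.MultiIndex.from_tuples([(np.nan, np.nan), (81.0, np.nan)])

    # Same values per level once NaNs are dropped, but differently ordered:
    # previously True; False with this patch applied.
    print(mi1.equals(mi2))
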
From d4b623361bf18b42c4074d7b5935101514cf128a Mon Sep 17 00:00:00 2001
From: Avinash Pancham <44933366+avinashpancham@users.noreply.github.com>
Date: Thu, 17 Dec 2020 03:18:50 +0100
Subject: [PATCH 07/17] DOC: Add doc-string examples for pd.read_sql using
 custom parse_dates arg values (#38475)

---
 ci/code_checks.sh |  4 ++++
 pandas/io/sql.py  | 58 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 3eeee61f62a7e..d2f20a91cc654 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -178,6 +178,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
     pytest -q --doctest-modules pandas/core/strings/
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
+    MSG='Doctests sql.py' ; echo $MSG
+    pytest -q --doctest-modules pandas/io/sql.py
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
     # Directories
     MSG='Doctests arrays'; echo $MSG
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index b7efb4a8d6947..23f992ceb009a 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -482,6 +482,64 @@ def read_sql(
     --------
     read_sql_table : Read SQL database table into a DataFrame.
     read_sql_query : Read SQL query into a DataFrame.
+
+    Examples
+    --------
+    Read data from SQL via either a SQL query or a SQL tablename.
+    When using a SQLite database only SQL queries are accepted,
+    providing only the SQL tablename will result in an error.
+
+    >>> from sqlite3 import connect
+    >>> conn = connect(':memory:')
+    >>> df = pd.DataFrame(data=[[0, '10/11/12'], [1, '12/11/10']],
+    ...                   columns=['int_column', 'date_column'])
+    >>> df.to_sql('test_data', conn)
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data', conn)
+       int_column date_column
+    0           0    10/11/12
+    1           1    12/11/10
+
+    >>> pd.read_sql('test_data', 'postgres:///db_name')  # doctest:+SKIP
+
+    Apply date parsing to columns through the ``parse_dates`` argument
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates=["date_column"])
+       int_column date_column
+    0           0  2012-10-11
+    1           1  2010-12-11
+
+    The ``parse_dates`` argument calls ``pd.to_datetime`` on the provided columns.
+    Custom argument values for applying ``pd.to_datetime`` on a column are specified
+    via a dictionary format:
+    1. Ignore errors while parsing the values of "date_column"
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates={"date_column": {"errors": "ignore"}})
+       int_column date_column
+    0           0  2012-10-11
+    1           1  2010-12-11
+
+    2. Apply a dayfirst date parsing order on the values of "date_column"
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates={"date_column": {"dayfirst": True}})
+       int_column date_column
+    0           0  2012-11-10
+    1           1  2010-11-12
+
+    3. Apply custom formatting when date parsing the values of "date_column"
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates={"date_column": {"format": "%d/%m/%y"}})
+       int_column date_column
+    0           0  2012-11-10
+    1           1  2010-11-12
     """
     pandas_sql = pandasSQL_builder(con)

From d08f12c0409e8de977ae1821dbfc583942f35bef Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 17 Dec 2020 05:46:45 -0800
Subject: [PATCH 08/17] REG: DataFrame.shift with axis=1 and CategoricalIndex
 columns (#38504)

---
 doc/source/whatsnew/v1.3.0.rst           |  1 -
 pandas/core/frame.py                     |  7 +++++--
 pandas/tests/frame/methods/test_shift.py | 24 +++++++++++++++++++++++-
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index af96269019ca4..990c87eab5a8d 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -195,7 +195,6 @@ Numeric
 ^^^^^^^
 - Bug in :meth:`DataFrame.quantile`, :meth:`DataFrame.sort_values` causing incorrect subsequent indexing behavior (:issue:`38351`)
 - Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` now retains numeric ``ExtensionDtype`` columns (:issue:`35340`)
--
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 1bf40f782f666..86a40f0845fd9 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4586,20 +4586,23 @@ def shift(
 
         if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0:
             # We will infer fill_value to match the closest column
 
+            # Use a column that we know is valid for our column's dtype GH#38434
+            label = self.columns[0]
+
             if periods > 0:
                 result = self.iloc[:, :-periods]
                 for col in range(min(ncols, abs(periods))):
                     # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
                     # Define filler inside loop so we get a copy
                     filler = self.iloc[:, 0].shift(len(self))
-                    result.insert(0, col, filler, allow_duplicates=True)
+                    result.insert(0, label, filler, allow_duplicates=True)
             else:
                 result = self.iloc[:, -periods:]
                 for col in range(min(ncols, abs(periods))):
                     # Define filler inside loop so we get a copy
                     filler = self.iloc[:, -1].shift(len(self))
                     result.insert(
-                        len(result.columns), col, filler, allow_duplicates=True
+                        len(result.columns), label, filler, allow_duplicates=True
                     )
 
             result.columns = self.columns.copy()
diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py
index 2e21ce8ec2256..40b3f1e89c015 100644
--- a/pandas/tests/frame/methods/test_shift.py
+++ b/pandas/tests/frame/methods/test_shift.py
@@ -2,7 +2,7 @@
 import pytest
 
 import pandas as pd
-from pandas import DataFrame, Index, Series, date_range, offsets
+from pandas import CategoricalIndex, DataFrame, Index, Series, date_range, offsets
 import pandas._testing as tm
 
 
@@ -292,3 +292,25 @@ def test_shift_dt64values_int_fill_deprecated(self):
 
         expected = DataFrame({"A": [pd.Timestamp(0), pd.Timestamp(0)], "B": df2["A"]})
         tm.assert_frame_equal(result, expected)
+
+    def test_shift_axis1_categorical_columns(self):
+        # GH#38434
+        ci = CategoricalIndex(["a", "b", "c"])
+        df = DataFrame(
+            {"a": [1, 3], "b": [2, 4], "c": [5, 6]}, index=ci[:-1], columns=ci
+        )
+        result = df.shift(axis=1)
+
+        expected = DataFrame(
+            {"a": [np.nan, np.nan], "b": [1, 3], "c": [2, 4]}, index=ci[:-1], columns=ci
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # periods != 1
+        result = df.shift(2, axis=1)
+        expected = DataFrame(
+            {"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 3]},
+            index=ci[:-1],
+            columns=ci,
+        )
+        tm.assert_frame_equal(result, expected)
From bffc7ad515ba812fc780d5a8e2a7af450ec95e9a Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 17 Dec 2020 05:50:58 -0800
Subject: [PATCH 09/17] BUG: Make DTI/TDI/PI argsort match their underlying
 arrays (#37965)

---
 pandas/core/groupby/grouper.py       | 5 ++++-
 pandas/core/indexes/base.py          | 4 ----
 pandas/tests/indexes/datetimelike.py | 8 ++++++++
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index d814a7cee436e..8267cdeb77517 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -373,7 +373,10 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False):
         # possibly sort
         if (self.sort or sort) and not ax.is_monotonic:
             # use stable sort to support first, last, nth
-            indexer = self.indexer = ax.argsort(kind="mergesort")
+            # TODO: why does putting na_position="first" fix datetimelike cases?
+            indexer = self.indexer = ax.array.argsort(
+                kind="mergesort", na_position="first"
+            )
             ax = ax.take(indexer)
             obj = obj.take(indexer, axis=self.axis)
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 2101893d39dc9..f757f41a96fa5 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4767,10 +4767,6 @@ def argsort(self, *args, **kwargs) -> np.ndarray:
         >>> idx[order]
         Index(['a', 'b', 'c', 'd'], dtype='object')
         """
-        if needs_i8_conversion(self.dtype):
-            # TODO: these do not match the underlying EA argsort methods GH#37863
-            return self.asi8.argsort(*args, **kwargs)
-
         # This works for either ndarray or EA, is overriden
         # by RangeIndex, MultIIndex
         return self._data.argsort(*args, **kwargs)
diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py
index 14f9c2f9de284..c128f4ab6b7dd 100644
--- a/pandas/tests/indexes/datetimelike.py
+++ b/pandas/tests/indexes/datetimelike.py
@@ -10,6 +10,14 @@
 
 
 class DatetimeLike(Base):
+    def test_argsort_matches_array(self):
+        rng = self.create_index()
+        rng = rng.insert(1, pd.NaT)
+
+        result = rng.argsort()
+        expected = rng._data.argsort()
+        tm.assert_numpy_array_equal(result, expected)
+
     def test_can_hold_identifiers(self):
         idx = self.create_index()
         key = idx[0]
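Note - the invariant the new test pins down, restated outside the test suite (illustrative; _data is internal): Index.argsort for datetime-like indexes no longer routes through the raw int64 view (where NaT is the minimum sentinel) but defers to the underlying array, so the two orderings below now agree, NaT placement included:

    import pandas as pd

    dti = pd.date_range("2016-01-01", periods=3).insert(1, pd.NaT)
    assert (dti.argsort() == dti._data.argsort()).all()
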
From baeacad24a417fbf880f11fac578f9cb5216711d Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 17 Dec 2020 05:52:43 -0800
Subject: [PATCH 10/17] ENH: support 2D in DatetimeArray._from_sequence (#38021)

---
 pandas/core/arrays/datetimes.py       |  8 ++++++--
 pandas/tests/arrays/test_datetimes.py | 18 ++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 8c94a1a080dca..5fdfa62c393c4 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -2071,20 +2071,24 @@ def objects_to_datetime64ns(
     # if str-dtype, convert
     data = np.array(data, copy=False, dtype=np.object_)
 
+    flags = data.flags
+    order = "F" if flags.f_contiguous else "C"
     try:
         result, tz_parsed = tslib.array_to_datetime(
-            data,
+            data.ravel("K"),
             errors=errors,
             utc=utc,
             dayfirst=dayfirst,
             yearfirst=yearfirst,
             require_iso8601=require_iso8601,
         )
+        result = result.reshape(data.shape, order=order)
     except ValueError as e:
         try:
-            values, tz_parsed = conversion.datetime_to_datetime64(data)
+            values, tz_parsed = conversion.datetime_to_datetime64(data.ravel("K"))
             # If tzaware, these values represent unix timestamps, so we
             # return them as i8 to distinguish from wall times
+            values = values.reshape(data.shape, order=order)
             return values.view("i8"), tz_parsed
         except (ValueError, TypeError):
             raise e
diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
index 1d8ee9cf2b73b..4addc0536848f 100644
--- a/pandas/tests/arrays/test_datetimes.py
+++ b/pandas/tests/arrays/test_datetimes.py
@@ -465,6 +465,24 @@ def test_tz_dtype_matches(self):
         result, _, _ = sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="US/Central"))
         tm.assert_numpy_array_equal(arr._data, result)
 
+    @pytest.mark.parametrize("order", ["F", "C"])
+    def test_2d(self, order):
+        dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
+        arr = np.array(dti, dtype=object).reshape(3, 2)
+        if order == "F":
+            arr = arr.T
+
+        res = sequence_to_dt64ns(arr)
+        expected = sequence_to_dt64ns(arr.ravel())
+
+        tm.assert_numpy_array_equal(res[0].ravel(), expected[0])
+        assert res[1] == expected[1]
+        assert res[2] == expected[2]
+
+        res = DatetimeArray._from_sequence(arr)
+        expected = DatetimeArray._from_sequence(arr.ravel()).reshape(arr.shape)
+        tm.assert_datetime_array_equal(res, expected)
+
 
 class TestReductions:
     @pytest.fixture
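Note - an illustrative round-trip mirroring the new test (DatetimeArray._from_sequence and reshape are private/internal APIs): parsing a 2D object array now ravels, parses in 1D, and reshapes back, preserving C/F contiguity:

    import numpy as np
    import pandas as pd
    from pandas.arrays import DatetimeArray

    dti = pd.date_range("2016-01-01", periods=6)
    arr2d = np.array(dti, dtype=object).reshape(3, 2)

    # element-for-element identical to parsing the flattened input
    result = DatetimeArray._from_sequence(arr2d)
    expected = DatetimeArray._from_sequence(arr2d.ravel()).reshape(3, 2)
    assert (result == expected).all()
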
+ """ + mgr = self.dummy._mgr + mgr.blocks = self.blocks + mgr._blklocs = self.orig_blklocs + mgr._blknos = self.orig_blknos diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7724e3930f7df..d1a4fc6fc74e5 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -202,13 +202,10 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): try: result_values, mutated = splitter.fast_apply(f, sdata, group_keys) - except libreduction.InvalidApply as err: - # This Exception is raised if `f` triggers an exception - # but it is preferable to raise the exception in Python. - if "Let this error raise above us" not in str(err): - # TODO: can we infer anything about whether this is - # worth-retrying in pure-python? - raise + except IndexError: + # This is a rare case in which re-running in python-space may + # make a difference, see test_apply_mutate.test_mutate_groups + pass else: # If the fast apply path could be used we can return here. From fbe71622d338e702fbe442f78714d991ee8dfd09 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Dec 2020 07:06:34 -0800 Subject: [PATCH 12/17] CLN: share .values (#38531) --- pandas/_testing.py | 4 +++- pandas/core/indexes/base.py | 10 +++++----- pandas/core/indexes/category.py | 5 ----- pandas/core/indexes/interval.py | 7 ------- 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 73b1dcf31979f..964c8d4d3d61a 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -834,7 +834,9 @@ def _get_ilevel_values(index, level): # skip exact index checking when `check_categorical` is False if check_exact and check_categorical: if not left.equals(right): - diff = np.sum((left.values != right.values).astype(int)) * 100.0 / len(left) + diff = ( + np.sum((left._values != right._values).astype(int)) * 100.0 / len(left) + ) msg = f"{obj} values are different ({np.round(diff, 5)} %)" raise_assert_detail(obj, msg, left, right) else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f757f41a96fa5..8d38e7f173594 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,7 +27,7 @@ from pandas._libs.lib import is_datetime_array, no_default from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime, Timestamp from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import AnyArrayLike, Dtype, DtypeObj, Label, Shape, final +from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Label, Shape, final from pandas.compat.numpy import function as nv from pandas.errors import DuplicateLabelError, InvalidIndexError from pandas.util._decorators import Appender, cache_readonly, doc @@ -1164,7 +1164,7 @@ def to_series(self, index=None, name=None): if name is None: name = self.name - return Series(self.values.copy(), index=index, name=name) + return Series(self._values.copy(), index=index, name=name) def to_frame(self, index: bool = True, name=None): """ @@ -4036,7 +4036,7 @@ def _wrap_joined_index( # Uncategorized Methods @property - def values(self) -> np.ndarray: + def values(self) -> ArrayLike: """ Return an array representing the data in the Index. @@ -4055,7 +4055,7 @@ def values(self) -> np.ndarray: Index.array : Reference to the underlying data. Index.to_numpy : A NumPy array representing the underlying data. 
""" - return self._data.view(np.ndarray) + return self._data @cache_readonly @doc(IndexOpsMixin.array) @@ -5318,7 +5318,7 @@ def _maybe_cast_slice_bound(self, label, side: str_t, kind): # wish to have special treatment for floats/ints, e.g. Float64Index and # datetimelike Indexes # reject them, if index does not contain label - if (is_float(label) or is_integer(label)) and label not in self.values: + if (is_float(label) or is_integer(label)) and label not in self._values: raise self._invalid_indexer("slice", label) return label diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 7c826000d035a..588ce0a4931ba 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -357,11 +357,6 @@ def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[st def inferred_type(self) -> str: return "categorical" - @property - def values(self): - """ return the underlying data, which is a Categorical """ - return self._data - @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 23363e2c6e32a..1416f3afd60b3 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -348,13 +348,6 @@ def __contains__(self, key: Any) -> bool: def _multiindex(self) -> MultiIndex: return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"]) - @cache_readonly - def values(self) -> IntervalArray: - """ - Return the IntervalIndex's data as an IntervalArray. - """ - return self._data - def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result From 8fd2d0c1eea04d56ec0a63fae084a66dd482003e Mon Sep 17 00:00:00 2001 From: aflah02 <72096386+aflah02@users.noreply.github.com> Date: Thu, 17 Dec 2020 22:24:23 +0530 Subject: [PATCH 13/17] Added Documentation to specify that DataFrame.last() needs the index to be sorted to deliver the expected results (#38536) * Update generic.py Added Documentation mentioning that DataFrame.last() needs the index to be sorted to deliver the expected results * Update generic.py Fixed PEP8 Issues * Update generic.py Fixed PEP 8 Issues * Update generic.py As per recommendation changed the description for DataFrame.last() making it more concise * Update generic.py Removed trailing whitespace to fix PEP 8 Issues --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f9aa5ca9e8ea9..9b0c3caa0b407 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8439,8 +8439,8 @@ def last(self: FrameOrSeries, offset) -> FrameOrSeries: """ Select final periods of time series data based on a date offset. - When having a DataFrame with dates as index, this function can - select the last few rows based on a date offset. + For a DataFrame with a sorted DatetimeIndex, this function + selects the last few rows based on a date offset. 
From f197ca5d1d552a532e359d43d18cd420a2be5069 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 17 Dec 2020 09:46:10 -0800
Subject: [PATCH 14/17] ENH: 2D compat for DTA tz_localize, to_period (#37950)

---
 pandas/core/arrays/_mixins.py            | 21 +++++++++++++++++++++
 pandas/core/arrays/datetimelike.py       | 11 +++++++++--
 pandas/core/arrays/datetimes.py          |  7 +++++--
 pandas/core/arrays/period.py             |  1 +
 pandas/core/arrays/timedeltas.py         |  3 ++-
 pandas/tests/arrays/test_datetimelike.py |  9 +++++++++
 pandas/tests/arrays/test_datetimes.py    | 11 +++++++++++
 7 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index 02214ff51b02a..b6938931e86af 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from functools import wraps
 from typing import Any, Optional, Sequence, Type, TypeVar, Union
 
 import numpy as np
@@ -27,6 +28,26 @@
 )
 
 
+def ravel_compat(meth):
+    """
+    Decorator to ravel a 2D array before passing it to a cython operation,
+    then reshape the result to our own shape.
+    """
+
+    @wraps(meth)
+    def method(self, *args, **kwargs):
+        if self.ndim == 1:
+            return meth(self, *args, **kwargs)
+
+        flags = self._ndarray.flags
+        flat = self.ravel("K")
+        result = meth(flat, *args, **kwargs)
+        order = "F" if flags.f_contiguous else "C"
+        return result.reshape(self.shape, order=order)
+
+    return method
+
+
 class NDArrayBackedExtensionArray(ExtensionArray):
     """
     ExtensionArray that is backed by a single NumPy ndarray.
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index be9864731842d..ee1323b71f146 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -64,7 +64,7 @@
 from pandas.core import nanops, ops
 from pandas.core.algorithms import checked_add_with_arr, isin, unique1d, value_counts
 from pandas.core.arraylike import OpsMixin
-from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
+from pandas.core.arrays._mixins import NDArrayBackedExtensionArray, ravel_compat
 import pandas.core.common as com
 from pandas.core.construction import array, extract_array
 from pandas.core.indexers import check_array_indexer, check_setitem_lengths
@@ -679,6 +679,9 @@ def value_counts(self, dropna: bool = False):
         -------
         Series
         """
+        if self.ndim != 1:
+            raise NotImplementedError
+
         from pandas import Index, Series
 
         if dropna:
@@ -694,6 +697,7 @@
         )
         return Series(result._values, index=index, name=result.name)
 
+    @ravel_compat
     def map(self, mapper):
         # TODO(GH-23179): Add ExtensionArray.map
         # Need to figure out if we want ExtensionArray.map first.
@@ -820,6 +824,9 @@ def freq(self, value):
             value = to_offset(value)
             self._validate_frequency(self, value)
 
+            if self.ndim > 1:
+                raise ValueError("Cannot set freq with ndim > 1")
+
         self._freq = value
 
     @property
@@ -918,7 +925,7 @@ def _is_monotonic_decreasing(self) -> bool:
 
     @property
     def _is_unique(self) -> bool:
-        return len(unique1d(self.asi8)) == len(self)
+        return len(unique1d(self.asi8.ravel("K"))) == self.size
 
     # ------------------------------------------------------------------
     # Arithmetic Methods
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 5fdfa62c393c4..b072ac3cec52e 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -612,14 +612,15 @@ def astype(self, dtype, copy=True):
     # -----------------------------------------------------------------
     # Rendering Methods
 
+    @dtl.ravel_compat
     def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
         from pandas.io.formats.format import get_format_datetime64_from_values
 
         fmt = get_format_datetime64_from_values(self, date_format)
 
         return tslib.format_array_from_datetime(
-            self.asi8.ravel(), tz=self.tz, format=fmt, na_rep=na_rep
-        ).reshape(self.shape)
+            self.asi8, tz=self.tz, format=fmt, na_rep=na_rep
+        )
 
     # -----------------------------------------------------------------
     # Comparison Methods
@@ -819,6 +820,7 @@ def tz_convert(self, tz):
         dtype = tz_to_dtype(tz)
         return self._simple_new(self.asi8, dtype=dtype, freq=self.freq)
 
+    @dtl.ravel_compat
     def tz_localize(self, tz, ambiguous="raise", nonexistent="raise"):
         """
         Localize tz-naive Datetime Array/Index to tz-aware
@@ -1051,6 +1053,7 @@ def normalize(self):
         new_values = normalize_i8_timestamps(self.asi8, self.tz)
         return type(self)(new_values)._with_freq("infer").tz_localize(self.tz)
 
+    @dtl.ravel_compat
     def to_period(self, freq=None):
         """
         Cast to PeriodArray/Index at a particular frequency.
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index 257baf20ce911..40dd475e6b6f2 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -562,6 +562,7 @@ def _formatter(self, boxed: bool = False):
             return str
         return "'{}'".format
 
+    @dtl.ravel_compat
     def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
         """
         actually format my specific types
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index 93c9567380f7f..fe4eaa4b4bf19 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -400,11 +400,12 @@ def _formatter(self, boxed=False):
 
         return get_format_timedelta64(self, box=True)
 
+    @dtl.ravel_compat
     def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
         from pandas.io.formats.format import get_format_timedelta64
 
         formatter = get_format_timedelta64(self._data, na_rep)
-        return np.array([formatter(x) for x in self._data.ravel()]).reshape(self.shape)
+        return np.array([formatter(x) for x in self._data])
 
     # ----------------------------------------------------------------
     # Arithmetic Methods
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index c489aa5867632..7c093ebe00959 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -720,6 +720,15 @@ def test_to_period(self, datetime_index, freqstr):
         # an EA-specific tm.assert_ function
         tm.assert_index_equal(pd.Index(result), pd.Index(expected))
 
+    def test_to_period_2d(self, arr1d):
+        arr2d = arr1d.reshape(1, -1)
+
+        warn = None if arr1d.tz is None else UserWarning
+        with tm.assert_produces_warning(warn):
+            result = arr2d.to_period("D")
+            expected = arr1d.to_period("D").reshape(1, -1)
+        tm.assert_period_array_equal(result, expected)
+
     @pytest.mark.parametrize("propname", pd.DatetimeIndex._bool_ops)
     def test_bool_properties(self, arr1d, propname):
         # in this case _bool_ops is just `is_leap_year`
diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
index 4addc0536848f..c8db0157ba219 100644
--- a/pandas/tests/arrays/test_datetimes.py
+++ b/pandas/tests/arrays/test_datetimes.py
@@ -449,6 +449,17 @@ def test_shift_requires_tzmatch(self):
         with pytest.raises(ValueError, match=msg):
             dta.shift(1, fill_value=fill_value)
 
+    def test_tz_localize_t2d(self):
+        dti = pd.date_range("1994-05-12", periods=12, tz="US/Pacific")
+        dta = dti._data.reshape(3, 4)
+        result = dta.tz_localize(None)
+
+        expected = dta.ravel().tz_localize(None).reshape(dta.shape)
+        tm.assert_datetime_array_equal(result, expected)
+
+        roundtrip = expected.tz_localize("US/Pacific")
+        tm.assert_datetime_array_equal(roundtrip, dta)
+
 
 class TestSequenceToDT64NS:
     def test_tz_dtype_mismatch_raises(self):
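Note - what the ravel_compat decorator buys in practice, mirroring the new test (illustrative; _data and reshape are internal): 2D datetime arrays, as DataFrame blocks hold them, can now be localized directly:

    import pandas as pd

    dti = pd.date_range("1994-05-12", periods=12, tz="US/Pacific")
    dta = dti._data.reshape(3, 4)  # 2D DatetimeArray

    # flatten -> 1D tz_localize -> reshape, handled by the decorator
    naive = dta.tz_localize(None)
    assert naive.shape == (3, 4)
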
From 76a5a4f55f0a7102b177d7e4a6f426d69bac0591 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 17 Dec 2020 13:05:27 -0800
Subject: [PATCH 15/17] CLN: dont consolidate in reshape.concat (#34683)

---
 pandas/core/internals/blocks.py | 45 ++++++++++++++++++++-------------
 pandas/core/internals/concat.py | 11 ++++++--
 pandas/core/reshape/concat.py   |  8 +++---
 3 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 2630c07814bb2..59301391a7dad 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -6,7 +6,16 @@
 
 import numpy as np
 
-from pandas._libs import NaT, algos as libalgos, internals as libinternals, lib, writers
+from pandas._libs import (
+    Interval,
+    NaT,
+    Period,
+    Timestamp,
+    algos as libalgos,
+    internals as libinternals,
+    lib,
+    writers,
+)
 from pandas._libs.internals import BlockPlacement
 from pandas._libs.tslibs import conversion
 from pandas._libs.tslibs.timezones import tz_compare
@@ -41,17 +50,15 @@
     is_float_dtype,
     is_integer,
     is_integer_dtype,
-    is_interval_dtype,
     is_list_like,
     is_object_dtype,
-    is_period_dtype,
     is_re,
     is_re_compilable,
     is_sparse,
     is_timedelta64_dtype,
     pandas_dtype,
 )
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype
 from pandas.core.dtypes.generic import ABCDataFrame, ABCIndex, ABCPandasArray, ABCSeries
 from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, isna_compat
 
@@ -2629,36 +2636,38 @@ def get_block_type(values, dtype=None):
     -------
     cls : class, subclass of Block
     """
+    # We use vtype and kind checks because they are much more performant
+    # than is_foo_dtype
     dtype = dtype or values.dtype
     vtype = dtype.type
+    kind = dtype.kind
 
     cls: Type[Block]
     if is_sparse(dtype):
         # Need this first(ish) so that Sparse[datetime] is sparse
         cls = ExtensionBlock
-    elif is_categorical_dtype(values.dtype):
+    elif isinstance(dtype, CategoricalDtype):
         cls = CategoricalBlock
-    elif issubclass(vtype, np.datetime64):
-        assert not is_datetime64tz_dtype(values.dtype)
-        cls = DatetimeBlock
-    elif is_datetime64tz_dtype(values.dtype):
+    elif vtype is Timestamp:
         cls = DatetimeTZBlock
-    elif is_interval_dtype(dtype) or is_period_dtype(dtype):
+    elif vtype is Interval or vtype is Period:
         cls = ObjectValuesExtensionBlock
-    elif is_extension_array_dtype(values.dtype):
+    elif isinstance(dtype, ExtensionDtype):
         # Note: need to be sure PandasArray is unwrapped before we get here
         cls = ExtensionBlock
-    elif issubclass(vtype, np.floating):
-        cls = FloatBlock
-    elif issubclass(vtype, np.timedelta64):
-        assert issubclass(vtype, np.integer)
+
+    elif kind == "M":
+        cls = DatetimeBlock
+    elif kind == "m":
         cls = TimeDeltaBlock
-    elif issubclass(vtype, np.complexfloating):
+    elif kind == "f":
+        cls = FloatBlock
+    elif kind == "c":
         cls = ComplexBlock
-    elif issubclass(vtype, np.integer):
+    elif kind == "i" or kind == "u":
         cls = IntBlock
-    elif dtype == np.bool_:
+    elif kind == "b":
         cls = BoolBlock
     else:
         cls = ObjectBlock
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 06de1972b4c9a..dd3a04ccb38e2 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -70,14 +70,21 @@ def concatenate_block_managers(
             vals = [ju.block.values for ju in join_units]
 
             if not blk.is_extension:
-                values = concat_compat(vals, axis=blk.ndim - 1)
+                # _is_uniform_join_units ensures a single dtype, so
+                # we can use np.concatenate, which is more performant
+                # than concat_compat
+                values = np.concatenate(vals, axis=blk.ndim - 1)
             else:
                 # TODO(EA2D): special-casing not needed with 2D EAs
                 values = concat_compat(vals)
                 if not isinstance(values, ExtensionArray):
                     values = values.reshape(1, len(values))
 
-            b = make_block(values, placement=placement, ndim=blk.ndim)
+            if blk.values.dtype == values.dtype:
+                # Fast-path
+                b = blk.make_block_same_class(values, placement=placement)
+            else:
+                b = make_block(values, placement=placement, ndim=blk.ndim)
         else:
             b = make_block(
                 _concatenate_join_units(join_units, concat_axis, copy=copy),
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 4a2629daf63d7..42b541bd4cb02 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -18,6 +18,7 @@
 import numpy as np
 
 from pandas._typing import FrameOrSeriesUnion, Label
+from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
@@ -360,7 +361,7 @@ def __init__(
         if len(objs) == 0:
             raise ValueError("All objects passed were None")
 
-        # consolidate data & figure out what our result ndim is going to be
+        # figure out what our result ndim is going to be
         ndims = set()
         for obj in objs:
             if not isinstance(obj, (ABCSeries, ABCDataFrame)):
                 msg = (
@@ -370,8 +371,6 @@ def __init__(
                 )
                 raise TypeError(msg)
 
-            # consolidate
-            obj._consolidate_inplace()
             ndims.add(obj.ndim)
 
         # get the sample
@@ -543,7 +542,7 @@ def _get_result_dim(self) -> int:
     def _get_new_axes(self) -> List[Index]:
         ndim = self._get_result_dim()
         return [
-            self._get_concat_axis() if i == self.bm_axis else self._get_comb_axis(i)
+            self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i)
             for i in range(ndim)
         ]
 
@@ -557,6 +556,7 @@ def _get_comb_axis(self, i: int) -> Index:
             copy=self.copy,
         )
 
+    @cache_readonly
     def _get_concat_axis(self) -> Index:
         """
         Return index to be used along concatenation axis.

From fde8d33db1bdf0bb6caff0af95ba35cc6933f3e8 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Thu, 17 Dec 2020 14:45:03 -0800
Subject: [PATCH 16/17] BENCH: Increase sample of
 CategoricalIndexIndexing.time_get_indexer_list benchmark (#38545)

---
 asv_bench/benchmarks/indexing.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 38d1f64bd5f4e..e95e5bec5849c 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -3,6 +3,7 @@
 lower-level methods directly on Index and subclasses, see index_object.py,
 indexing_engine.py, and index_cached.py
 """
+import itertools
 import string
 import warnings
 
@@ -256,7 +257,9 @@ def setup(self, index):
             "non_monotonic": CategoricalIndex(list("abc" * N)),
         }
         self.data = indices[index]
-        self.data_unique = CategoricalIndex(list(string.printable))
+        self.data_unique = CategoricalIndex(
+            ["".join(perm) for perm in itertools.permutations(string.printable, 3)]
+        )
 
         self.int_scalar = 10000
         self.int_list = list(range(10000))
From 54682234e3a3e89e246313bf8f9a53f98b199e7b Mon Sep 17 00:00:00 2001
From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
Date: Fri, 18 Dec 2020 07:40:04 +0800
Subject: [PATCH 17/17] BUG: CategoricalIndex.reindex fails when Index passed
 with labels all in category (#38492)

---
 doc/source/whatsnew/v1.3.0.rst                |  2 +-
 pandas/core/indexes/category.py               |  2 +-
 .../tests/indexes/categorical/test_reindex.py | 34 ++++++++++++++++++-
 3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 990c87eab5a8d..3545dd8a89159 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -170,7 +170,7 @@ Bug fixes
 
 Categorical
 ^^^^^^^^^^^
--
+- Bug in ``CategoricalIndex.reindex`` failing when an ``Index`` was passed with all of its labels in the categories (:issue:`28690`)
 -
 
 Datetimelike
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 588ce0a4931ba..76b1c061cc827 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -428,7 +428,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
 
         if len(missing):
             cats = self.categories.get_indexer(target)
 
-            if (cats == -1).any():
+            if not isinstance(cats, CategoricalIndex) or (cats == -1).any():
                 # coerce to a regular index here!
                 result = Index(np.array(self), name=self.name)
                 new_target, indexer, _ = result._reindex_non_unique(np.array(target))
diff --git a/pandas/tests/indexes/categorical/test_reindex.py b/pandas/tests/indexes/categorical/test_reindex.py
index 668c559abd08e..8228c5139ccdd 100644
--- a/pandas/tests/indexes/categorical/test_reindex.py
+++ b/pandas/tests/indexes/categorical/test_reindex.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-from pandas import Categorical, CategoricalIndex, Index, Series
+from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series
 import pandas._testing as tm
 
 
@@ -59,3 +59,35 @@ def test_reindex_missing_category(self):
         msg = "'fill_value=-1' is not present in this Categorical's categories"
         with pytest.raises(TypeError, match=msg):
             ser.reindex([1, 2, 3, 4, 5], fill_value=-1)
+
+    @pytest.mark.parametrize(
+        "index_df,index_res,index_exp",
+        [
+            (
+                CategoricalIndex([], categories=["A"]),
+                Index(["A"]),
+                Index(["A"]),
+            ),
+            (
+                CategoricalIndex([], categories=["A"]),
+                Index(["B"]),
+                Index(["B"]),
+            ),
+            (
+                CategoricalIndex([], categories=["A"]),
+                CategoricalIndex(["A"]),
+                CategoricalIndex(["A"]),
+            ),
+            (
+                CategoricalIndex([], categories=["A"]),
+                CategoricalIndex(["B"]),
+                CategoricalIndex(["B"]),
+            ),
+        ],
+    )
+    def test_reindex_not_category(self, index_df, index_res, index_exp):
+        # GH: 28690
+        df = DataFrame(index=index_df)
+        result = df.reindex(index=index_res)
+        expected = DataFrame(index=index_exp)
+        tm.assert_frame_equal(result, expected)
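Note - an illustrative before/after for the final patch, mirroring its tests: reindexing a frame indexed by an empty CategoricalIndex with labels that are all valid categories previously failed in the reindex branch guarded by (cats == -1).any(); it now simply returns a frame indexed by the requested labels:

    import pandas as pd

    df = pd.DataFrame(index=pd.CategoricalIndex([], categories=["A"]))
    result = df.reindex(index=pd.Index(["A"]))
    print(result.index)  # Index(['A'], dtype='object')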