Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into bisect
Browse files Browse the repository at this point in the history
  • Loading branch information
simonjayhawkins committed Dec 18, 2020
2 parents 5ecf905 + 5468223 commit bf40fc2
Show file tree
Hide file tree
Showing 39 changed files with 409 additions and 170 deletions.
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ The source code is currently hosted on GitHub at:
https://github.com/pandas-dev/pandas

Binary installers for the latest released version are available at the [Python
package index](https://pypi.org/project/pandas) and on conda.
Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/).

```sh
# conda
Expand All @@ -100,15 +100,15 @@ pip install pandas
```

## Dependencies
- [NumPy](https://www.numpy.org)
- [python-dateutil](https://labix.org/python-dateutil)
- [pytz](https://pythonhosted.org/pytz)
- [NumPy - Adds support for large, multi-dimensional arrays, matrices and high-level mathematical functions to operate on these arrays](https://www.numpy.org)
- [python-dateutil - Provides powerful extensions to the standard datetime module](https://labix.org/python-dateutil)
- [pytz - Brings the Olson tz database into Python which allows accurate and cross platform timezone calculations](https://pythonhosted.org/pytz)

See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) for minimum supported versions of required, recommended and optional dependencies.

## Installation from sources
To install pandas from source you need Cython in addition to the normal
dependencies above. Cython can be installed from pypi:
To install pandas from source you need [Cython](https://cython.org/) in addition to the normal
dependencies above. Cython can be installed from PyPI:

```sh
pip install cython
Expand Down Expand Up @@ -145,7 +145,7 @@ See the full instructions for [installing from source](https://pandas.pydata.org
The official documentation is hosted on PyData.org: https://pandas.pydata.org/pandas-docs/stable

## Background
Work on ``pandas`` started at AQR (a quantitative hedge fund) in 2008 and
Work on ``pandas`` started at [AQR](https://www.aqr.com/) (a quantitative hedge fund) in 2008 and
has been under active development since then.

## Getting Help
Expand Down
5 changes: 4 additions & 1 deletion asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
lower-level methods directly on Index and subclasses, see index_object.py,
indexing_engine.py, and index_cached.py
"""
import itertools
import string
import warnings

Expand Down Expand Up @@ -256,7 +257,9 @@ def setup(self, index):
"non_monotonic": CategoricalIndex(list("abc" * N)),
}
self.data = indices[index]
self.data_unique = CategoricalIndex(list(string.printable))
self.data_unique = CategoricalIndex(
["".join(perm) for perm in itertools.permutations(string.printable, 3)]
)

self.int_scalar = 10000
self.int_list = list(range(10000))
Expand Down
95 changes: 63 additions & 32 deletions asv_bench/benchmarks/io/csv.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from io import StringIO
from io import BytesIO, StringIO
import random
import string

Expand Down Expand Up @@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
class ReadCSVSkipRows(BaseIO):

fname = "__test__.csv"
params = [None, 10000]
param_names = ["skiprows"]
params = ([None, 10000], ["c", "python"])
param_names = ["skiprows", "engine"]

def setup(self, skiprows):
def setup(self, skiprows, engine):
N = 20000
index = tm.makeStringIndex(N)
df = DataFrame(
Expand All @@ -164,8 +164,8 @@ def setup(self, skiprows):
)
df.to_csv(self.fname)

def time_skipprows(self, skiprows):
read_csv(self.fname, skiprows=skiprows)
def time_skipprows(self, skiprows, engine):
read_csv(self.fname, skiprows=skiprows, engine=engine)


class ReadUint64Integers(StringIORewind):
Expand All @@ -192,10 +192,10 @@ def time_read_uint64_na_values(self):
class ReadCSVThousands(BaseIO):

fname = "__test__.csv"
params = ([",", "|"], [None, ","])
param_names = ["sep", "thousands"]
params = ([",", "|"], [None, ","], ["c", "python"])
param_names = ["sep", "thousands", "engine"]

def setup(self, sep, thousands):
def setup(self, sep, thousands, engine):
N = 10000
K = 8
data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
Expand All @@ -206,16 +206,19 @@ def setup(self, sep, thousands):
df = df.applymap(lambda x: fmt.format(x))
df.to_csv(self.fname, sep=sep)

def time_thousands(self, sep, thousands):
read_csv(self.fname, sep=sep, thousands=thousands)
def time_thousands(self, sep, thousands, engine):
read_csv(self.fname, sep=sep, thousands=thousands, engine=engine)


class ReadCSVComment(StringIORewind):
    # ASV benchmark: cost of comment-stripping in read_csv, per parser engine.
    params = ["c", "python"]
    param_names = ["engine"]

    def setup(self, engine):
        # 100k data rows, each carrying an inline "# comment" suffix.
        data = ["A,B,C"] + (["1,2,3 # comment"] * 100000)
        self.StringIO_input = StringIO("\n".join(data))

    def time_comment(self, engine):
        # BUG FIX: the original body never forwarded ``engine``, so both the
        # "c" and "python" parameterizations silently benchmarked the default
        # (C) engine. Pass it through so the parameter actually takes effect.
        read_csv(
            self.data(self.StringIO_input),
            comment="#",
            header=None,
            names=list("abc"),
            engine=engine,
        )
Expand Down Expand Up @@ -255,25 +258,47 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
)


class ReadCSVEngine(StringIORewind):
    # ASV benchmark: raw read_csv throughput for each parser engine, fed
    # from both text (StringIO) and binary (BytesIO) buffers.
    params = ["c", "python"]
    param_names = ["engine"]

    def setup(self, engine):
        rows = ["A,B,C,D,E"]
        rows.extend(["1,2,3,4,5"] * 100000)
        text = "\n".join(rows)
        self.StringIO_input = StringIO(text)
        # simulate reading from file
        self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8"))

    def time_read_stringcsv(self, engine):
        # Parse the text buffer (rewound via self.data) with the given engine.
        read_csv(self.data(self.StringIO_input), engine=engine)

    def time_read_bytescsv(self, engine):
        # Same parse, but from the binary buffer.
        read_csv(self.data(self.BytesIO_input), engine=engine)


class ReadCSVCategorical(BaseIO):
    # ASV benchmark: reading categorical-like string data, comparing a
    # post-hoc Categorical conversion against dtype="category" at parse time,
    # for each parser engine.

    fname = "__test__.csv"
    params = ["c", "python"]
    param_names = ["engine"]

    def setup(self, engine):
        n_rows = 100000
        labels = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"]
        frame = DataFrame(
            np.random.choice(labels, (n_rows, 3)), columns=["a", "b", "c"]
        )
        frame.to_csv(self.fname, index=False)

    def time_convert_post(self, engine):
        # Read as plain object columns, then convert each to Categorical.
        read_csv(self.fname, engine=engine).apply(Categorical)

    def time_convert_direct(self, engine):
        # Let the parser produce categorical columns directly.
        read_csv(self.fname, engine=engine, dtype="category")


class ReadCSVParseDates(StringIORewind):
def setup(self):
params = ["c", "python"]
param_names = ["engine"]

def setup(self, engine):
data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
{},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
Expand All @@ -284,18 +309,20 @@ def setup(self):
data = data.format(*two_cols)
self.StringIO_input = StringIO(data)

def time_multiple_date(self):
def time_multiple_date(self, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
names=list(string.digits[:9]),
parse_dates=[[1, 2], [1, 3]],
)

def time_baseline(self):
def time_baseline(self, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
parse_dates=[1],
Expand All @@ -304,17 +331,18 @@ def time_baseline(self):


class ReadCSVCachedParseDates(StringIORewind):
params = ([True, False],)
param_names = ["do_cache"]
params = ([True, False], ["c", "python"])
param_names = ["do_cache", "engine"]

def setup(self, do_cache):
def setup(self, do_cache, engine):
data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
self.StringIO_input = StringIO(data)

def time_read_csv_cached(self, do_cache):
def time_read_csv_cached(self, do_cache, engine):
try:
read_csv(
self.data(self.StringIO_input),
engine=engine,
header=None,
parse_dates=[0],
cache_dates=do_cache,
Expand All @@ -329,37 +357,40 @@ class ReadCSVMemoryGrowth(BaseIO):
chunksize = 20
num_rows = 1000
fname = "__test__.csv"
params = ["c", "python"]
param_names = ["engine"]

def setup(self):
def setup(self, engine):
with open(self.fname, "w") as f:
for i in range(self.num_rows):
f.write(f"{i}\n")

def mem_parser_chunks(self):
def mem_parser_chunks(self, engine):
# see gh-24805.
result = read_csv(self.fname, chunksize=self.chunksize)
result = read_csv(self.fname, chunksize=self.chunksize, engine=engine)

for _ in result:
pass


class ReadCSVParseSpecialDate(StringIORewind):
params = (["mY", "mdY", "hm"],)
param_names = ["value"]
params = (["mY", "mdY", "hm"], ["c", "python"])
param_names = ["value", "engine"]
objects = {
"mY": "01-2019\n10-2019\n02/2000\n",
"mdY": "12/02/2010\n",
"hm": "21:34\n",
}

def setup(self, value):
def setup(self, value, engine):
count_elem = 10000
data = self.objects[value] * count_elem
self.StringIO_input = StringIO(data)

def time_read_special_date(self, value):
def time_read_special_date(self, value, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
names=["Date"],
Expand Down
4 changes: 4 additions & 0 deletions ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
pytest -q --doctest-modules pandas/core/strings/
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Doctests sql.py' ; echo $MSG
pytest -q --doctest-modules pandas/io/sql.py
RET=$(($RET + $?)) ; echo $MSG "DONE"

# Directories

MSG='Doctests arrays'; echo $MSG
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-37-slow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ dependencies:
- moto>=1.3.14
- scipy
- sqlalchemy
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- moto
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-38-locale.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ dependencies:
- pytz
- scipy
- xarray
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- moto
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-macos-37.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ dependencies:
- python-dateutil==2.7.3
- pytz
- xarray
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- pip
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-37.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ dependencies:
- s3fs>=0.4.2
- scipy
- sqlalchemy
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- pyreadstat
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ dependencies:
- pytz
- s3fs>=0.4.0
- scipy
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
2 changes: 1 addition & 1 deletion ci/deps/travis-37-cov.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ dependencies:
- sqlalchemy
- statsmodels
- xarray
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- pip
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/travis-37-locale.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ dependencies:
- pytables>=3.5.1
- scipy
- xarray=0.12.3
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- moto
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/travis-38-slow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ dependencies:
- moto>=1.3.14
- scipy
- sqlalchemy
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- moto
Expand Down
5 changes: 2 additions & 3 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ Bug fixes
Categorical
^^^^^^^^^^^

-
- Bug in ``CategoricalIndex.reindex`` failed when ``Index`` passed with elements all in category (:issue:`28690`)
-

Datetimelike
Expand All @@ -195,7 +195,6 @@ Numeric
^^^^^^^
- Bug in :meth:`DataFrame.quantile`, :meth:`DataFrame.sort_values` causing incorrect subsequent indexing behavior (:issue:`38351`)
- Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` now retains numeric ``ExtensionDtype`` columns (:issue:`35340`)
-

Conversion
^^^^^^^^^^
Expand Down Expand Up @@ -232,7 +231,7 @@ MultiIndex
^^^^^^^^^^

- Bug in :meth:`DataFrame.drop` raising ``TypeError`` when :class:`MultiIndex` is non-unique and no level is provided (:issue:`36293`)
-
- Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when :class:`MultiIndex` containing ``NaN`` even when they are differently ordered (:issue:`38439`)

I/O
^^^
Expand Down
Loading

0 comments on commit bf40fc2

Please sign in to comment.