From 8a457c725012f805eb5f26b76c8018092c1ea416 Mon Sep 17 00:00:00 2001
From: aflah02 <72096386+aflah02@users.noreply.github.com>
Date: Wed, 16 Dec 2020 21:48:40 +0530
Subject: [PATCH 01/17] Updated README (#38491)

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 6d1d890c54093..f238e219bd3d8 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,7 @@ The source code is currently hosted on GitHub at:
 https://github.com/pandas-dev/pandas
 
 Binary installers for the latest released version are available at the [Python
-package index](https://pypi.org/project/pandas) and on conda.
+Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/).
 
 ```sh
 # conda
 conda install pandas
@@ -100,15 +100,15 @@ pip install pandas
 ```
 
 ## Dependencies
-- [NumPy](https://www.numpy.org)
-- [python-dateutil](https://labix.org/python-dateutil)
-- [pytz](https://pythonhosted.org/pytz)
+- [NumPy - Adds support for large, multi-dimensional arrays, matrices and high-level mathematical functions to operate on these arrays](https://www.numpy.org)
+- [python-dateutil - Provides powerful extensions to the standard datetime module](https://labix.org/python-dateutil)
+- [pytz - Brings the Olson tz database into Python which allows accurate and cross platform timezone calculations](https://pythonhosted.org/pytz)
 
 See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) for minimum supported versions of required, recommended and optional dependencies.
 
 ## Installation from sources
-To install pandas from source you need Cython in addition to the normal
-dependencies above. Cython can be installed from pypi:
+To install pandas from source you need [Cython](https://cython.org/) in addition to the normal
+dependencies above. Cython can be installed from PyPI:
 
 ```sh
 pip install cython
@@ -145,7 +145,7 @@ See the full instructions for [installing from source](https://pandas.pydata.org
 The official documentation is hosted on PyData.org: https://pandas.pydata.org/pandas-docs/stable
 
 ## Background
-Work on ``pandas`` started at AQR (a quantitative hedge fund) in 2008 and
+Work on ``pandas`` started at [AQR](https://www.aqr.com/) (a quantitative hedge fund) in 2008 and
 has been under active development since then.
 
 ## Getting Help
From d210962d0da0f2fa3d0c433122a81ca98958b90b Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 16 Dec 2020 11:26:02 -0800
Subject: [PATCH 02/17] CI: pin xlrd<2.0 (#38526)

---
 ci/deps/azure-37-slow.yaml    | 2 +-
 ci/deps/azure-38-locale.yaml  | 2 +-
 ci/deps/azure-macos-37.yaml   | 2 +-
 ci/deps/azure-windows-37.yaml | 2 +-
 ci/deps/azure-windows-38.yaml | 2 +-
 ci/deps/travis-37-cov.yaml    | 2 +-
 ci/deps/travis-37-locale.yaml | 2 +-
 ci/deps/travis-38-slow.yaml   | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/ci/deps/azure-37-slow.yaml b/ci/deps/azure-37-slow.yaml
index 50fccf86b6340..05b33fa351ac9 100644
--- a/ci/deps/azure-37-slow.yaml
+++ b/ci/deps/azure-37-slow.yaml
@@ -31,7 +31,7 @@ dependencies:
   - moto>=1.3.14
   - scipy
   - sqlalchemy
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - moto
diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml
index f879111a32e67..90cd11037e472 100644
--- a/ci/deps/azure-38-locale.yaml
+++ b/ci/deps/azure-38-locale.yaml
@@ -30,7 +30,7 @@ dependencies:
   - pytz
   - scipy
   - xarray
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - moto
diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml
index 31e0ffca81424..0b8aff83fe230 100644
--- a/ci/deps/azure-macos-37.yaml
+++ b/ci/deps/azure-macos-37.yaml
@@ -26,7 +26,7 @@ dependencies:
   - python-dateutil==2.7.3
   - pytz
   - xarray
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - pip
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
index 16b4bd72683b4..ad72b9c8577e9 100644
--- a/ci/deps/azure-windows-37.yaml
+++ b/ci/deps/azure-windows-37.yaml
@@ -33,7 +33,7 @@ dependencies:
   - s3fs>=0.4.2
   - scipy
   - sqlalchemy
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - pyreadstat
diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml
index 449bbd05991bf..08693e02aa8d3 100644
--- a/ci/deps/azure-windows-38.yaml
+++ b/ci/deps/azure-windows-38.yaml
@@ -31,6 +31,6 @@ dependencies:
   - pytz
   - s3fs>=0.4.0
   - scipy
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
diff --git a/ci/deps/travis-37-cov.yaml b/ci/deps/travis-37-cov.yaml
index c89b42ef06a2e..b68ff0672888a 100644
--- a/ci/deps/travis-37-cov.yaml
+++ b/ci/deps/travis-37-cov.yaml
@@ -43,7 +43,7 @@ dependencies:
   - sqlalchemy
   - statsmodels
   - xarray
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - pip
diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml
index 4e442b10482a7..60a92c4dfd3c6 100644
--- a/ci/deps/travis-37-locale.yaml
+++ b/ci/deps/travis-37-locale.yaml
@@ -35,7 +35,7 @@ dependencies:
   - pytables>=3.5.1
   - scipy
   - xarray=0.12.3
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - moto
diff --git a/ci/deps/travis-38-slow.yaml b/ci/deps/travis-38-slow.yaml
index e4b719006a11e..2b4339cf12658 100644
--- a/ci/deps/travis-38-slow.yaml
+++ b/ci/deps/travis-38-slow.yaml
@@ -30,7 +30,7 @@ dependencies:
   - moto>=1.3.14
   - scipy
   - sqlalchemy
-  - xlrd
+  - xlrd<2.0
   - xlsxwriter
   - xlwt
   - moto

From 7043f8fa9d4d97782ec0d0d1a4c3b57573a7fc21 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 16 Dec 2020 14:32:23 -0800
Subject: [PATCH 03/17] REF: use astype_nansafe in Index.astype (#38518)

---
 pandas/core/indexes/base.py    | 20 ++++++++++----------
 pandas/core/indexes/numeric.py | 19 -------------------
 2 files changed, 10 insertions(+), 29 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index bafb37775cbb1..2101893d39dc9 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -33,6 +33,7 @@ from pandas.util._decorators import Appender, cache_readonly, doc
 
 from pandas.core.dtypes.cast import (
+    astype_nansafe,
     find_common_type,
     maybe_cast_to_integer_array,
     maybe_promote,
@@ -693,22 +694,21 @@ def astype(self, dtype, copy=True):
 
         if is_dtype_equal(self.dtype, dtype):
             return self.copy() if copy else self
-        elif is_categorical_dtype(dtype):
-            from pandas.core.indexes.category import CategoricalIndex
-
-            return CategoricalIndex(
-                self._values, name=self.name, dtype=dtype, copy=copy
+        if needs_i8_conversion(dtype) and is_float_dtype(self.dtype):
+            # We can't put this into astype_nansafe bc astype_nansafe allows
+            # casting np.nan to NaT
+            raise TypeError(
+                f"Cannot convert {type(self).__name__} to dtype {dtype}; integer "
+                "values are required for conversion"
             )
-        elif is_extension_array_dtype(dtype):
-            return Index(np.asarray(self), name=self.name, dtype=dtype, copy=copy)
 
         try:
-            casted = self._values.astype(dtype, copy=copy)
-        except (TypeError, ValueError) as err:
+            casted = astype_nansafe(self._values, dtype=dtype, copy=True)
+        except TypeError as err:
             raise TypeError(
                 f"Cannot cast {type(self).__name__} to dtype {dtype}"
             ) from err
+
         return Index(casted, name=self.name, dtype=dtype)
 
 _index_shared_docs[
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
index 91d27d9922aa5..d6f91c9a06739 100644
--- a/pandas/core/indexes/numeric.py
+++ b/pandas/core/indexes/numeric.py
@@ -7,12 +7,10 @@ from pandas._typing import Dtype, DtypeObj, Label
 from pandas.util._decorators import doc
 
-from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.dtypes.common import (
     is_bool,
     is_bool_dtype,
     is_dtype_equal,
-    is_extension_array_dtype,
     is_float,
     is_float_dtype,
     is_integer_dtype,
@@ -21,8 +19,6 @@
     is_scalar,
     is_signed_integer_dtype,
     is_unsigned_integer_dtype,
-    needs_i8_conversion,
-    pandas_dtype,
 )
 from pandas.core.dtypes.generic import ABCSeries
 from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna
@@ -332,21 +328,6 @@ def inferred_type(self) -> str:
         """
         return "floating"
 
-    @doc(Index.astype)
-    def astype(self, dtype, copy=True):
-        dtype = pandas_dtype(dtype)
-        if needs_i8_conversion(dtype):
-            raise TypeError(
-                f"Cannot convert Float64Index to dtype {dtype}; integer "
-                "values are required for conversion"
-            )
-        elif is_integer_dtype(dtype) and not is_extension_array_dtype(dtype):
-            # TODO(jreback); this can change once we have an EA Index type
-            # GH 13149
-            arr = astype_nansafe(self._values, dtype=dtype)
-            return Int64Index(arr, name=self.name)
-        return super().astype(dtype, copy=copy)
-
     # ----------------------------------------------------------------
     # Indexing Methods
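Note - the snippet below is an illustration of the behavior the patch above standardizes, not part of the patch itself. With Index.astype routed through astype_nansafe, a float-dtype Index now refuses a cast to a datetime-like dtype up front (previously only Float64Index special-cased this); the error message shown is the one added in pandas/core/indexes/base.py:

    import pandas as pd

    idx = pd.Index([1.5, 2.5])  # float64 Index

    # needs_i8_conversion(dtype) and is_float_dtype(self.dtype) -> TypeError,
    # since silently casting np.nan to NaT would be ambiguous
    try:
        idx.astype("datetime64[ns]")
    except TypeError as err:
        print(err)  # "... integer values are required for conversion"
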
From 7d8a052ee869ee547d204f53b15a5dc7c6b3f0c3 Mon Sep 17 00:00:00 2001
From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com>
Date: Wed, 16 Dec 2020 20:23:05 -0500
Subject: [PATCH 04/17] BENCH/REF: parametrize CSV benchmarks on engine
 (#38442)

---
 asv_bench/benchmarks/io/csv.py | 95 ++++++++++++++++++++++------------
 1 file changed, 63 insertions(+), 32 deletions(-)

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 9bcd125f56bbb..24d21ad6a633d 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -1,4 +1,4 @@
-from io import StringIO
+from io import BytesIO, StringIO
 import random
 import string
 
@@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
 
 class ReadCSVSkipRows(BaseIO):
     fname = "__test__.csv"
-    params = [None, 10000]
-    param_names = ["skiprows"]
+    params = ([None, 10000], ["c", "python"])
+    param_names = ["skiprows", "engine"]
 
-    def setup(self, skiprows):
+    def setup(self, skiprows, engine):
         N = 20000
         index = tm.makeStringIndex(N)
         df = DataFrame(
@@ -164,8 +164,8 @@ def setup(self, skiprows):
         )
         df.to_csv(self.fname)
 
-    def time_skipprows(self, skiprows):
-        read_csv(self.fname, skiprows=skiprows)
+    def time_skipprows(self, skiprows, engine):
+        read_csv(self.fname, skiprows=skiprows, engine=engine)
 
 
 class ReadUint64Integers(StringIORewind):
@@ -192,10 +192,10 @@ def time_read_uint64_na_values(self):
 
 class ReadCSVThousands(BaseIO):
     fname = "__test__.csv"
-    params = ([",", "|"], [None, ","])
-    param_names = ["sep", "thousands"]
+    params = ([",", "|"], [None, ","], ["c", "python"])
+    param_names = ["sep", "thousands", "engine"]
 
-    def setup(self, sep, thousands):
+    def setup(self, sep, thousands, engine):
         N = 10000
         K = 8
         data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
@@ -206,16 +206,19 @@ def setup(self, sep, thousands):
         df = df.applymap(lambda x: fmt.format(x))
         df.to_csv(self.fname, sep=sep)
 
-    def time_thousands(self, sep, thousands):
-        read_csv(self.fname, sep=sep, thousands=thousands)
+    def time_thousands(self, sep, thousands, engine):
+        read_csv(self.fname, sep=sep, thousands=thousands, engine=engine)
 
 
 class ReadCSVComment(StringIORewind):
-    def setup(self):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
        data = ["A,B,C"] + (["1,2,3 # comment"] * 100000)
         self.StringIO_input = StringIO("\n".join(data))
 
-    def time_comment(self):
+    def time_comment(self, engine):
         read_csv(
             self.data(self.StringIO_input), comment="#", header=None, names=list("abc")
         )
@@ -255,25 +258,47 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
         )
 
 
+class ReadCSVEngine(StringIORewind):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
+        data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000)
+        self.StringIO_input = StringIO("\n".join(data))
+        # simulate reading from file
+        self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8"))
+
+    def time_read_stringcsv(self, engine):
+        read_csv(self.data(self.StringIO_input), engine=engine)
+
+    def time_read_bytescsv(self, engine):
+        read_csv(self.data(self.BytesIO_input), engine=engine)
+
+
 class ReadCSVCategorical(BaseIO):
     fname = "__test__.csv"
+    params = ["c", "python"]
+    param_names = ["engine"]
 
-    def setup(self):
+    def setup(self, engine):
         N = 100000
         group1 = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"]
         df = DataFrame(np.random.choice(group1, (N, 3)), columns=list("abc"))
         df.to_csv(self.fname, index=False)
 
-    def time_convert_post(self):
-        read_csv(self.fname).apply(Categorical)
+    def time_convert_post(self, engine):
+        read_csv(self.fname, engine=engine).apply(Categorical)
 
-    def time_convert_direct(self):
-        read_csv(self.fname, dtype="category")
+    def time_convert_direct(self, engine):
+        read_csv(self.fname, engine=engine, dtype="category")
 
 
 class ReadCSVParseDates(StringIORewind):
-    def setup(self):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
         data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
         {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
         {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
@@ -284,18 +309,20 @@ def setup(self):
         data = data.format(*two_cols)
         self.StringIO_input = StringIO(data)
 
-    def time_multiple_date(self):
+    def time_multiple_date(self, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             names=list(string.digits[:9]),
             parse_dates=[[1, 2], [1, 3]],
         )
 
-    def time_baseline(self):
+    def time_baseline(self, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             parse_dates=[1],
 
 
 class ReadCSVCachedParseDates(StringIORewind):
-    params = ([True, False],)
-    param_names = ["do_cache"]
+    params = ([True, False], ["c", "python"])
+    param_names = ["do_cache", "engine"]
 
-    def setup(self, do_cache):
+    def setup(self, do_cache, engine):
         data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
         self.StringIO_input = StringIO(data)
 
-    def time_read_csv_cached(self, do_cache):
+    def time_read_csv_cached(self, do_cache, engine):
         try:
             read_csv(
                 self.data(self.StringIO_input),
+                engine=engine,
                 header=None,
                 parse_dates=[0],
                 cache_dates=do_cache,
@@ -329,37 +357,40 @@ class ReadCSVMemoryGrowth(BaseIO):
     chunksize = 20
     num_rows = 1000
     fname = "__test__.csv"
+    params = ["c", "python"]
+    param_names = ["engine"]
 
-    def setup(self):
+    def setup(self, engine):
         with open(self.fname, "w") as f:
             for i in range(self.num_rows):
                 f.write(f"{i}\n")
 
-    def mem_parser_chunks(self):
+    def mem_parser_chunks(self, engine):
         # see gh-24805.
-        result = read_csv(self.fname, chunksize=self.chunksize)
+        result = read_csv(self.fname, chunksize=self.chunksize, engine=engine)
 
         for _ in result:
             pass
 
 
 class ReadCSVParseSpecialDate(StringIORewind):
-    params = (["mY", "mdY", "hm"],)
-    param_names = ["value"]
+    params = (["mY", "mdY", "hm"], ["c", "python"])
+    param_names = ["value", "engine"]
     objects = {
         "mY": "01-2019\n10-2019\n02/2000\n",
         "mdY": "12/02/2010\n",
         "hm": "21:34\n",
     }
 
-    def setup(self, value):
+    def setup(self, value, engine):
         count_elem = 10000
         data = self.objects[value] * count_elem
         self.StringIO_input = StringIO(data)
 
-    def time_read_special_date(self, value):
+    def time_read_special_date(self, value, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             names=["Date"],
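Note - a minimal sketch (hypothetical ReadCSVToy class, not part of the patch) of the asv params/param_names pattern this patch applies throughout: asv runs setup and each time_* method once per element of the cross-product of the params lists, passing one argument per entry in param_names:

    from io import StringIO

    from pandas import read_csv

    class ReadCSVToy:
        # asv benchmarks every combination:
        # (None, "c"), (None, "python"), (10000, "c"), (10000, "python")
        params = ([None, 10000], ["c", "python"])
        param_names = ["skiprows", "engine"]

        def setup(self, skiprows, engine):
            # build the input once per combination
            self.csv = "a,b,c\n" + "1,2,3\n" * 20000

        def time_skiprows(self, skiprows, engine):
            read_csv(StringIO(self.csv), skiprows=skiprows, engine=engine)
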
From a66482e129a438f013962db2f6cd778d20be1bba Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 16 Dec 2020 17:23:32 -0800
Subject: [PATCH 05/17] CLN: remove CategoricalIndex._engine (#38529)

---
 pandas/core/indexes/category.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index e2a7752cf3f0d..7c826000d035a 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -8,7 +8,7 @@
 from pandas._libs import index as libindex
 from pandas._libs.lib import no_default
 from pandas._typing import ArrayLike, Label
-from pandas.util._decorators import Appender, cache_readonly, doc
+from pandas.util._decorators import Appender, doc
 
 from pandas.core.dtypes.common import (
     ensure_platform_int,
@@ -381,14 +381,6 @@ def fillna(self, value, downcast=None):
         cat = self._data.fillna(value)
         return type(self)._simple_new(cat, name=self.name)
 
-    @cache_readonly
-    def _engine(self):
-        # we are going to look things up with the codes themselves.
-        # To avoid a reference cycle, bind `codes` to a local variable, so
-        # `self` is not passed into the lambda.
-        codes = self.codes
-        return self._engine_type(lambda: codes, len(self))
-
     @doc(Index.unique)
     def unique(self, level=None):
         if level is not None:

From 0556613072fe44b88289d908992174a5b8509019 Mon Sep 17 00:00:00 2001
From: patrick <61934744+phofl@users.noreply.github.com>
Date: Thu, 17 Dec 2020 03:12:42 +0100
Subject: [PATCH 06/17] BUG: MultiIndex.equals incorrectly returning True when
 Indexes contain NaN (#38511)

---
 doc/source/whatsnew/v1.3.0.rst                 |  2 +-
 pandas/core/indexes/multi.py                   | 10 +++++++---
 pandas/tests/indexes/multi/test_equivalence.py | 10 ++++++++++
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 57dd1d05a274e..af96269019ca4 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -232,7 +232,7 @@ MultiIndex
 ^^^^^^^^^^
 
 - Bug in :meth:`DataFrame.drop` raising ``TypeError`` when :class:`MultiIndex` is non-unique and no level is provided (:issue:`36293`)
--
+- Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when the :class:`MultiIndex` objects contain ``NaN`` but are ordered differently (:issue:`38439`)
 
 I/O
 ^^^
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 1edd98e980a2d..78e7a8516178a 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -3454,13 +3454,17 @@ def equals(self, other: object) -> bool:
 
         for i in range(self.nlevels):
             self_codes = self.codes[i]
-            self_codes = self_codes[self_codes != -1]
+            other_codes = other.codes[i]
+            self_mask = self_codes == -1
+            other_mask = other_codes == -1
+            if not np.array_equal(self_mask, other_mask):
+                return False
+            self_codes = self_codes[~self_mask]
             self_values = algos.take_nd(
                 np.asarray(self.levels[i]._values), self_codes, allow_fill=False
             )
 
-            other_codes = other.codes[i]
-            other_codes = other_codes[other_codes != -1]
+            other_codes = other_codes[~other_mask]
             other_values = algos.take_nd(
                 np.asarray(other.levels[i]._values), other_codes, allow_fill=False
             )
diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py
index c31c2416ff722..bb34760e28d96 100644
--- a/pandas/tests/indexes/multi/test_equivalence.py
+++ b/pandas/tests/indexes/multi/test_equivalence.py
@@ -209,6 +209,16 @@ def test_equals_missing_values():
     assert not result
 
 
+def test_equals_missing_values_differently_sorted():
+    # GH#38439
+    mi1 = pd.MultiIndex.from_tuples([(81.0, np.nan), (np.nan, np.nan)])
+    mi2 = pd.MultiIndex.from_tuples([(np.nan, np.nan), (81.0, np.nan)])
+    assert not mi1.equals(mi2)
+
+    mi2 = pd.MultiIndex.from_tuples([(81.0, np.nan), (np.nan, np.nan)])
+    assert mi1.equals(mi2)
+
+
 def test_is_():
     mi = MultiIndex.from_tuples(zip(range(10), range(10)))
     assert mi.is_(mi)
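Note - an illustrative restatement of what the mask comparison above fixes (not part of the patch): the old code dropped the -1 (missing) codes from each level before comparing values, so two MultiIndexes whose NaNs sit in different rows could still compare equal:

    import numpy as np
    import pandas as pd

    mi1 = pd.MultiIndex.from_tuples([(81.0, np.nan), (np.nan, np.nan)])
    mi2 = pd.MultiIndex.from_tuples([(np.nan, np.nan), (81.0, np.nan)])

    # Same values per level once NaNs are dropped, but differently ordered:
    # previously True; False with this patch applied.
    print(mi1.equals(mi2))
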
From d4b623361bf18b42c4074d7b5935101514cf128a Mon Sep 17 00:00:00 2001
From: Avinash Pancham <44933366+avinashpancham@users.noreply.github.com>
Date: Thu, 17 Dec 2020 03:18:50 +0100
Subject: [PATCH 07/17] DOC: Add doc-string examples for pd.read_sql using
 custom parse_dates arg values (#38475)

---
 ci/code_checks.sh |  4 ++++
 pandas/io/sql.py  | 58 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 3eeee61f62a7e..d2f20a91cc654 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -178,6 +178,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
     pytest -q --doctest-modules pandas/core/strings/
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
+    MSG='Doctests sql.py' ; echo $MSG
+    pytest -q --doctest-modules pandas/io/sql.py
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
     # Directories
     MSG='Doctests arrays'; echo $MSG
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index b7efb4a8d6947..23f992ceb009a 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -482,6 +482,64 @@ def read_sql(
     --------
     read_sql_table : Read SQL database table into a DataFrame.
     read_sql_query : Read SQL query into a DataFrame.
+
+    Examples
+    --------
+    Read data from SQL via either a SQL query or a SQL tablename.
+    When using a SQLite database only SQL queries are accepted,
+    providing only the SQL tablename will result in an error.
+
+    >>> from sqlite3 import connect
+    >>> conn = connect(':memory:')
+    >>> df = pd.DataFrame(data=[[0, '10/11/12'], [1, '12/11/10']],
+    ...                   columns=['int_column', 'date_column'])
+    >>> df.to_sql('test_data', conn)
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data', conn)
+       int_column date_column
+    0           0    10/11/12
+    1           1    12/11/10
+
+    >>> pd.read_sql('test_data', 'postgres:///db_name')  # doctest:+SKIP
+
+    Apply date parsing to columns through the ``parse_dates`` argument
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates=["date_column"])
+       int_column date_column
+    0           0  2012-10-11
+    1           1  2010-12-11
+
+    The ``parse_dates`` argument calls ``pd.to_datetime`` on the provided columns.
+    Custom argument values for applying ``pd.to_datetime`` on a column are specified
+    via a dictionary format:
+    1. Ignore errors while parsing the values of "date_column"
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates={"date_column": {"errors": "ignore"}})
+       int_column date_column
+    0           0  2012-10-11
+    1           1  2010-12-11
+
+    2. Apply a dayfirst date parsing order on the values of "date_column"
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates={"date_column": {"dayfirst": True}})
+       int_column date_column
+    0           0  2012-11-10
+    1           1  2010-11-12
+
+    3. Apply custom formatting when date parsing the values of "date_column"
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates={"date_column": {"format": "%d/%m/%y"}})
+       int_column date_column
+    0           0  2012-11-10
+    1           1  2010-11-12
     """
     pandas_sql = pandasSQL_builder(con)

From d08f12c0409e8de977ae1821dbfc583942f35bef Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 17 Dec 2020 05:46:45 -0800
Subject: [PATCH 08/17] REG: DataFrame.shift with axis=1 and CategoricalIndex
 columns (#38504)

---
 doc/source/whatsnew/v1.3.0.rst           |  1 -
 pandas/core/frame.py                     |  7 +++++--
 pandas/tests/frame/methods/test_shift.py | 24 +++++++++++++++++++++++-
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index af96269019ca4..990c87eab5a8d 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -195,7 +195,6 @@ Numeric
 ^^^^^^^
 - Bug in :meth:`DataFrame.quantile`, :meth:`DataFrame.sort_values` causing incorrect subsequent indexing behavior (:issue:`38351`)
 - Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` now retains numeric ``ExtensionDtype`` columns (:issue:`35340`)
--
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 1bf40f782f666..86a40f0845fd9 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4586,20 +4586,23 @@ def shift(
 
         if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0:
             # We will infer fill_value to match the closest column
 
+            # Use a column that we know is valid for our column's dtype GH#38434
+            label = self.columns[0]
+
             if periods > 0:
                 result = self.iloc[:, :-periods]
                 for col in range(min(ncols, abs(periods))):
                     # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
                     # Define filler inside loop so we get a copy
                     filler = self.iloc[:, 0].shift(len(self))
-                    result.insert(0, col, filler, allow_duplicates=True)
+                    result.insert(0, label, filler, allow_duplicates=True)
             else:
                 result = self.iloc[:, -periods:]
                 for col in range(min(ncols, abs(periods))):
                     # Define filler inside loop so we get a copy
                     filler = self.iloc[:, -1].shift(len(self))
                     result.insert(
-                        len(result.columns), col, filler, allow_duplicates=True
+                        len(result.columns), label, filler, allow_duplicates=True
                     )
 
             result.columns = self.columns.copy()
diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py
index 2e21ce8ec2256..40b3f1e89c015 100644
--- a/pandas/tests/frame/methods/test_shift.py
+++ b/pandas/tests/frame/methods/test_shift.py
@@ -2,7 +2,7 @@
 import pytest
 
 import pandas as pd
-from pandas import DataFrame, Index, Series, date_range, offsets
+from pandas import CategoricalIndex, DataFrame, Index, Series, date_range, offsets
 import pandas._testing as tm
 
 
@@ -292,3 +292,25 @@ def test_shift_dt64values_int_fill_deprecated(self):
 
         expected = DataFrame({"A": [pd.Timestamp(0), pd.Timestamp(0)], "B": df2["A"]})
         tm.assert_frame_equal(result, expected)
+
+    def test_shift_axis1_categorical_columns(self):
+        # GH#38434
+        ci = CategoricalIndex(["a", "b", "c"])
+        df = DataFrame(
+            {"a": [1, 3], "b": [2, 4], "c": [5, 6]}, index=ci[:-1], columns=ci
+        )
+        result = df.shift(axis=1)
+
+        expected = DataFrame(
+            {"a": [np.nan, np.nan], "b": [1, 3], "c": [2, 4]}, index=ci[:-1], columns=ci
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # periods != 1
+        result = df.shift(2, axis=1)
+        expected = DataFrame(
+            {"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 3]},
+            index=ci[:-1],
+            columns=ci,
+        )
+        tm.assert_frame_equal(result, expected)
From bffc7ad515ba812fc780d5a8e2a7af450ec95e9a Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 17 Dec 2020 05:50:58 -0800
Subject: [PATCH 09/17] BUG: Make DTI/TDI/PI argsort match their underlying
 arrays (#37965)

---
 pandas/core/groupby/grouper.py       | 5 ++++-
 pandas/core/indexes/base.py          | 4 ----
 pandas/tests/indexes/datetimelike.py | 8 ++++++++
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index d814a7cee436e..8267cdeb77517 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -373,7 +373,10 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False):
         # possibly sort
         if (self.sort or sort) and not ax.is_monotonic:
             # use stable sort to support first, last, nth
-            indexer = self.indexer = ax.argsort(kind="mergesort")
+            # TODO: why does putting na_position="first" fix datetimelike cases?
+            indexer = self.indexer = ax.array.argsort(
+                kind="mergesort", na_position="first"
+            )
             ax = ax.take(indexer)
             obj = obj.take(indexer, axis=self.axis)
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 2101893d39dc9..f757f41a96fa5 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4767,10 +4767,6 @@ def argsort(self, *args, **kwargs) -> np.ndarray:
         >>> idx[order]
         Index(['a', 'b', 'c', 'd'], dtype='object')
         """
-        if needs_i8_conversion(self.dtype):
-            # TODO: these do not match the underlying EA argsort methods GH#37863
-            return self.asi8.argsort(*args, **kwargs)
-
         # This works for either ndarray or EA, is overriden
         # by RangeIndex, MultIIndex
         return self._data.argsort(*args, **kwargs)
diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py
index 14f9c2f9de284..c128f4ab6b7dd 100644
--- a/pandas/tests/indexes/datetimelike.py
+++ b/pandas/tests/indexes/datetimelike.py
@@ -10,6 +10,14 @@
 
 
 class DatetimeLike(Base):
+    def test_argsort_matches_array(self):
+        rng = self.create_index()
+        rng = rng.insert(1, pd.NaT)
+
+        result = rng.argsort()
+        expected = rng._data.argsort()
+        tm.assert_numpy_array_equal(result, expected)
+
     def test_can_hold_identifiers(self):
         idx = self.create_index()
         key = idx[0]
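Note - the invariant the new test pins down, restated outside the test suite (illustrative; _data is internal): Index.argsort for datetime-like indexes no longer routes through the raw int64 view (where NaT is the minimum sentinel) but defers to the underlying array, so the two orderings below now agree, NaT placement included:

    import pandas as pd

    dti = pd.date_range("2016-01-01", periods=3).insert(1, pd.NaT)
    assert (dti.argsort() == dti._data.argsort()).all()
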
From baeacad24a417fbf880f11fac578f9cb5216711d Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 17 Dec 2020 05:52:43 -0800
Subject: [PATCH 10/17] ENH: support 2D in DatetimeArray._from_sequence (#38021)

---
 pandas/core/arrays/datetimes.py       |  8 ++++++--
 pandas/tests/arrays/test_datetimes.py | 18 ++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 8c94a1a080dca..5fdfa62c393c4 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -2071,20 +2071,24 @@ def objects_to_datetime64ns(
     # if str-dtype, convert
     data = np.array(data, copy=False, dtype=np.object_)
 
+    flags = data.flags
+    order = "F" if flags.f_contiguous else "C"
     try:
         result, tz_parsed = tslib.array_to_datetime(
-            data,
+            data.ravel("K"),
             errors=errors,
             utc=utc,
             dayfirst=dayfirst,
             yearfirst=yearfirst,
             require_iso8601=require_iso8601,
         )
+        result = result.reshape(data.shape, order=order)
     except ValueError as e:
         try:
-            values, tz_parsed = conversion.datetime_to_datetime64(data)
+            values, tz_parsed = conversion.datetime_to_datetime64(data.ravel("K"))
             # If tzaware, these values represent unix timestamps, so we
             # return them as i8 to distinguish from wall times
+            values = values.reshape(data.shape, order=order)
             return values.view("i8"), tz_parsed
         except (ValueError, TypeError):
             raise e
diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
index 1d8ee9cf2b73b..4addc0536848f 100644
--- a/pandas/tests/arrays/test_datetimes.py
+++ b/pandas/tests/arrays/test_datetimes.py
@@ -465,6 +465,24 @@ def test_tz_dtype_matches(self):
         result, _, _ = sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="US/Central"))
         tm.assert_numpy_array_equal(arr._data, result)
 
+    @pytest.mark.parametrize("order", ["F", "C"])
+    def test_2d(self, order):
+        dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
+        arr = np.array(dti, dtype=object).reshape(3, 2)
+        if order == "F":
+            arr = arr.T
+
+        res = sequence_to_dt64ns(arr)
+        expected = sequence_to_dt64ns(arr.ravel())
+
+        tm.assert_numpy_array_equal(res[0].ravel(), expected[0])
+        assert res[1] == expected[1]
+        assert res[2] == expected[2]
+
+        res = DatetimeArray._from_sequence(arr)
+        expected = DatetimeArray._from_sequence(arr.ravel()).reshape(arr.shape)
+        tm.assert_datetime_array_equal(res, expected)
+
 
 class TestReductions:
     @pytest.fixture
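Note - an illustrative round-trip mirroring the new test (DatetimeArray._from_sequence and reshape are private/internal APIs): parsing a 2D object array now ravels, parses in 1D, and reshapes back, preserving C/F contiguity:

    import numpy as np
    import pandas as pd
    from pandas.arrays import DatetimeArray

    dti = pd.date_range("2016-01-01", periods=6)
    arr2d = np.array(dti, dtype=object).reshape(3, 2)

    # element-for-element identical to parsing the flattened input
    result = DatetimeArray._from_sequence(arr2d)
    expected = DatetimeArray._from_sequence(arr2d.ravel()).reshape(3, 2)
    assert (result == expected).all()
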
+ """ + mgr = self.dummy._mgr + mgr.blocks = self.blocks + mgr._blklocs = self.orig_blklocs + mgr._blknos = self.orig_blknos diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7724e3930f7df..d1a4fc6fc74e5 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -202,13 +202,10 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): try: result_values, mutated = splitter.fast_apply(f, sdata, group_keys) - except libreduction.InvalidApply as err: - # This Exception is raised if `f` triggers an exception - # but it is preferable to raise the exception in Python. - if "Let this error raise above us" not in str(err): - # TODO: can we infer anything about whether this is - # worth-retrying in pure-python? - raise + except IndexError: + # This is a rare case in which re-running in python-space may + # make a difference, see test_apply_mutate.test_mutate_groups + pass else: # If the fast apply path could be used we can return here. From fbe71622d338e702fbe442f78714d991ee8dfd09 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Dec 2020 07:06:34 -0800 Subject: [PATCH 12/17] CLN: share .values (#38531) --- pandas/_testing.py | 4 +++- pandas/core/indexes/base.py | 10 +++++----- pandas/core/indexes/category.py | 5 ----- pandas/core/indexes/interval.py | 7 ------- 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 73b1dcf31979f..964c8d4d3d61a 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -834,7 +834,9 @@ def _get_ilevel_values(index, level): # skip exact index checking when `check_categorical` is False if check_exact and check_categorical: if not left.equals(right): - diff = np.sum((left.values != right.values).astype(int)) * 100.0 / len(left) + diff = ( + np.sum((left._values != right._values).astype(int)) * 100.0 / len(left) + ) msg = f"{obj} values are different ({np.round(diff, 5)} %)" raise_assert_detail(obj, msg, left, right) else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f757f41a96fa5..8d38e7f173594 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,7 +27,7 @@ from pandas._libs.lib import is_datetime_array, no_default from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime, Timestamp from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import AnyArrayLike, Dtype, DtypeObj, Label, Shape, final +from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Label, Shape, final from pandas.compat.numpy import function as nv from pandas.errors import DuplicateLabelError, InvalidIndexError from pandas.util._decorators import Appender, cache_readonly, doc @@ -1164,7 +1164,7 @@ def to_series(self, index=None, name=None): if name is None: name = self.name - return Series(self.values.copy(), index=index, name=name) + return Series(self._values.copy(), index=index, name=name) def to_frame(self, index: bool = True, name=None): """ @@ -4036,7 +4036,7 @@ def _wrap_joined_index( # Uncategorized Methods @property - def values(self) -> np.ndarray: + def values(self) -> ArrayLike: """ Return an array representing the data in the Index. @@ -4055,7 +4055,7 @@ def values(self) -> np.ndarray: Index.array : Reference to the underlying data. Index.to_numpy : A NumPy array representing the underlying data. 
""" - return self._data.view(np.ndarray) + return self._data @cache_readonly @doc(IndexOpsMixin.array) @@ -5318,7 +5318,7 @@ def _maybe_cast_slice_bound(self, label, side: str_t, kind): # wish to have special treatment for floats/ints, e.g. Float64Index and # datetimelike Indexes # reject them, if index does not contain label - if (is_float(label) or is_integer(label)) and label not in self.values: + if (is_float(label) or is_integer(label)) and label not in self._values: raise self._invalid_indexer("slice", label) return label diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 7c826000d035a..588ce0a4931ba 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -357,11 +357,6 @@ def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[st def inferred_type(self) -> str: return "categorical" - @property - def values(self): - """ return the underlying data, which is a Categorical """ - return self._data - @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 23363e2c6e32a..1416f3afd60b3 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -348,13 +348,6 @@ def __contains__(self, key: Any) -> bool: def _multiindex(self) -> MultiIndex: return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"]) - @cache_readonly - def values(self) -> IntervalArray: - """ - Return the IntervalIndex's data as an IntervalArray. - """ - return self._data - def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result From 8fd2d0c1eea04d56ec0a63fae084a66dd482003e Mon Sep 17 00:00:00 2001 From: aflah02 <72096386+aflah02@users.noreply.github.com> Date: Thu, 17 Dec 2020 22:24:23 +0530 Subject: [PATCH 13/17] Added Documentation to specify that DataFrame.last() needs the index to be sorted to deliver the expected results (#38536) * Update generic.py Added Documentation mentioning that DataFrame.last() needs the index to be sorted to deliver the expected results * Update generic.py Fixed PEP8 Issues * Update generic.py Fixed PEP 8 Issues * Update generic.py As per recommendation changed the description for DataFrame.last() making it more concise * Update generic.py Removed trailing whitespace to fix PEP 8 Issues --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f9aa5ca9e8ea9..9b0c3caa0b407 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8439,8 +8439,8 @@ def last(self: FrameOrSeries, offset) -> FrameOrSeries: """ Select final periods of time series data based on a date offset. - When having a DataFrame with dates as index, this function can - select the last few rows based on a date offset. + For a DataFrame with a sorted DatetimeIndex, this function + selects the last few rows based on a date offset. 
From f197ca5d1d552a532e359d43d18cd420a2be5069 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 17 Dec 2020 09:46:10 -0800
Subject: [PATCH 14/17] ENH: 2D compat for DTA tz_localize, to_period (#37950)

---
 pandas/core/arrays/_mixins.py            | 21 +++++++++++++++++++++
 pandas/core/arrays/datetimelike.py       | 11 +++++++++--
 pandas/core/arrays/datetimes.py          |  7 +++++--
 pandas/core/arrays/period.py             |  1 +
 pandas/core/arrays/timedeltas.py         |  3 ++-
 pandas/tests/arrays/test_datetimelike.py |  9 +++++++++
 pandas/tests/arrays/test_datetimes.py    | 11 +++++++++++
 7 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index 02214ff51b02a..b6938931e86af 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from functools import wraps
 from typing import Any, Optional, Sequence, Type, TypeVar, Union
 
 import numpy as np
@@ -27,6 +28,26 @@
 )
 
 
+def ravel_compat(meth):
+    """
+    Decorator to ravel a 2D array before passing it to a cython operation,
+    then reshape the result to our own shape.
+    """
+
+    @wraps(meth)
+    def method(self, *args, **kwargs):
+        if self.ndim == 1:
+            return meth(self, *args, **kwargs)
+
+        flags = self._ndarray.flags
+        flat = self.ravel("K")
+        result = meth(flat, *args, **kwargs)
+        order = "F" if flags.f_contiguous else "C"
+        return result.reshape(self.shape, order=order)
+
+    return method
+
+
 class NDArrayBackedExtensionArray(ExtensionArray):
     """
     ExtensionArray that is backed by a single NumPy ndarray.
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index be9864731842d..ee1323b71f146 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -64,7 +64,7 @@
 from pandas.core import nanops, ops
 from pandas.core.algorithms import checked_add_with_arr, isin, unique1d, value_counts
 from pandas.core.arraylike import OpsMixin
-from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
+from pandas.core.arrays._mixins import NDArrayBackedExtensionArray, ravel_compat
 import pandas.core.common as com
 from pandas.core.construction import array, extract_array
 from pandas.core.indexers import check_array_indexer, check_setitem_lengths
@@ -679,6 +679,9 @@ def value_counts(self, dropna: bool = False):
         -------
         Series
         """
+        if self.ndim != 1:
+            raise NotImplementedError
+
         from pandas import Index, Series
 
         if dropna:
@@ -694,6 +697,7 @@
         )
         return Series(result._values, index=index, name=result.name)
 
+    @ravel_compat
     def map(self, mapper):
         # TODO(GH-23179): Add ExtensionArray.map
         # Need to figure out if we want ExtensionArray.map first.
@@ -820,6 +824,9 @@ def freq(self, value):
             value = to_offset(value)
             self._validate_frequency(self, value)
 
+            if self.ndim > 1:
+                raise ValueError("Cannot set freq with ndim > 1")
+
         self._freq = value
 
     @property
@@ -918,7 +925,7 @@ def _is_monotonic_decreasing(self) -> bool:
 
     @property
     def _is_unique(self) -> bool:
-        return len(unique1d(self.asi8)) == len(self)
+        return len(unique1d(self.asi8.ravel("K"))) == self.size
 
     # ------------------------------------------------------------------
     # Arithmetic Methods
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 5fdfa62c393c4..b072ac3cec52e 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -612,14 +612,15 @@ def astype(self, dtype, copy=True):
     # -----------------------------------------------------------------
     # Rendering Methods
 
+    @dtl.ravel_compat
     def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
         from pandas.io.formats.format import get_format_datetime64_from_values
 
         fmt = get_format_datetime64_from_values(self, date_format)
 
         return tslib.format_array_from_datetime(
-            self.asi8.ravel(), tz=self.tz, format=fmt, na_rep=na_rep
-        ).reshape(self.shape)
+            self.asi8, tz=self.tz, format=fmt, na_rep=na_rep
+        )
 
     # -----------------------------------------------------------------
     # Comparison Methods
@@ -819,6 +820,7 @@ def tz_convert(self, tz):
         dtype = tz_to_dtype(tz)
         return self._simple_new(self.asi8, dtype=dtype, freq=self.freq)
 
+    @dtl.ravel_compat
     def tz_localize(self, tz, ambiguous="raise", nonexistent="raise"):
         """
         Localize tz-naive Datetime Array/Index to tz-aware
@@ -1051,6 +1053,7 @@ def normalize(self):
         new_values = normalize_i8_timestamps(self.asi8, self.tz)
         return type(self)(new_values)._with_freq("infer").tz_localize(self.tz)
 
+    @dtl.ravel_compat
     def to_period(self, freq=None):
         """
         Cast to PeriodArray/Index at a particular frequency.
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index 257baf20ce911..40dd475e6b6f2 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -562,6 +562,7 @@ def _formatter(self, boxed: bool = False):
             return str
         return "'{}'".format
 
+    @dtl.ravel_compat
     def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
         """
         actually format my specific types
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index 93c9567380f7f..fe4eaa4b4bf19 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -400,11 +400,12 @@ def _formatter(self, boxed=False):
 
         return get_format_timedelta64(self, box=True)
 
+    @dtl.ravel_compat
     def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
         from pandas.io.formats.format import get_format_timedelta64
 
         formatter = get_format_timedelta64(self._data, na_rep)
-        return np.array([formatter(x) for x in self._data.ravel()]).reshape(self.shape)
+        return np.array([formatter(x) for x in self._data])
 
     # ----------------------------------------------------------------
     # Arithmetic Methods
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index c489aa5867632..7c093ebe00959 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -720,6 +720,15 @@ def test_to_period(self, datetime_index, freqstr):
         # an EA-specific tm.assert_ function
         tm.assert_index_equal(pd.Index(result), pd.Index(expected))
 
+    def test_to_period_2d(self, arr1d):
+        arr2d = arr1d.reshape(1, -1)
+
+        warn = None if arr1d.tz is None else UserWarning
+        with tm.assert_produces_warning(warn):
+            result = arr2d.to_period("D")
+            expected = arr1d.to_period("D").reshape(1, -1)
+        tm.assert_period_array_equal(result, expected)
+
     @pytest.mark.parametrize("propname", pd.DatetimeIndex._bool_ops)
     def test_bool_properties(self, arr1d, propname):
         # in this case _bool_ops is just `is_leap_year`
diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
index 4addc0536848f..c8db0157ba219 100644
--- a/pandas/tests/arrays/test_datetimes.py
+++ b/pandas/tests/arrays/test_datetimes.py
@@ -449,6 +449,17 @@ def test_shift_requires_tzmatch(self):
         with pytest.raises(ValueError, match=msg):
             dta.shift(1, fill_value=fill_value)
 
+    def test_tz_localize_t2d(self):
+        dti = pd.date_range("1994-05-12", periods=12, tz="US/Pacific")
+        dta = dti._data.reshape(3, 4)
+        result = dta.tz_localize(None)
+
+        expected = dta.ravel().tz_localize(None).reshape(dta.shape)
+        tm.assert_datetime_array_equal(result, expected)
+
+        roundtrip = expected.tz_localize("US/Pacific")
+        tm.assert_datetime_array_equal(roundtrip, dta)
+
 
 class TestSequenceToDT64NS:
     def test_tz_dtype_mismatch_raises(self):
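Note - what the ravel_compat decorator buys in practice, mirroring the new test (illustrative; _data and reshape are internal): 2D datetime arrays, as DataFrame blocks hold them, can now be localized directly:

    import pandas as pd

    dti = pd.date_range("1994-05-12", periods=12, tz="US/Pacific")
    dta = dti._data.reshape(3, 4)  # 2D DatetimeArray

    # flatten -> 1D tz_localize -> reshape, handled by the decorator
    naive = dta.tz_localize(None)
    assert naive.shape == (3, 4)
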
From 76a5a4f55f0a7102b177d7e4a6f426d69bac0591 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 17 Dec 2020 13:05:27 -0800
Subject: [PATCH 15/17] CLN: dont consolidate in reshape.concat (#34683)

---
 pandas/core/internals/blocks.py | 45 ++++++++++++++++++++-------------
 pandas/core/internals/concat.py | 11 ++++++--
 pandas/core/reshape/concat.py   |  8 +++---
 3 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 2630c07814bb2..59301391a7dad 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -6,7 +6,16 @@
 
 import numpy as np
 
-from pandas._libs import NaT, algos as libalgos, internals as libinternals, lib, writers
+from pandas._libs import (
+    Interval,
+    NaT,
+    Period,
+    Timestamp,
+    algos as libalgos,
+    internals as libinternals,
+    lib,
+    writers,
+)
 from pandas._libs.internals import BlockPlacement
 from pandas._libs.tslibs import conversion
 from pandas._libs.tslibs.timezones import tz_compare
@@ -41,17 +50,15 @@
     is_float_dtype,
     is_integer,
     is_integer_dtype,
-    is_interval_dtype,
     is_list_like,
     is_object_dtype,
-    is_period_dtype,
     is_re,
     is_re_compilable,
     is_sparse,
     is_timedelta64_dtype,
     pandas_dtype,
 )
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype
 from pandas.core.dtypes.generic import ABCDataFrame, ABCIndex, ABCPandasArray, ABCSeries
 from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, isna_compat
 
@@ -2629,36 +2636,38 @@ def get_block_type(values, dtype=None):
     -------
     cls : class, subclass of Block
     """
+    # We use vtype and kind checks because they are much more performant
+    # than is_foo_dtype
     dtype = dtype or values.dtype
     vtype = dtype.type
+    kind = dtype.kind
 
     cls: Type[Block]
     if is_sparse(dtype):
         # Need this first(ish) so that Sparse[datetime] is sparse
         cls = ExtensionBlock
-    elif is_categorical_dtype(values.dtype):
+    elif isinstance(dtype, CategoricalDtype):
         cls = CategoricalBlock
-    elif issubclass(vtype, np.datetime64):
-        assert not is_datetime64tz_dtype(values.dtype)
-        cls = DatetimeBlock
-    elif is_datetime64tz_dtype(values.dtype):
+    elif vtype is Timestamp:
         cls = DatetimeTZBlock
-    elif is_interval_dtype(dtype) or is_period_dtype(dtype):
+    elif vtype is Interval or vtype is Period:
         cls = ObjectValuesExtensionBlock
-    elif is_extension_array_dtype(values.dtype):
+    elif isinstance(dtype, ExtensionDtype):
         # Note: need to be sure PandasArray is unwrapped before we get here
         cls = ExtensionBlock
-    elif issubclass(vtype, np.floating):
-        cls = FloatBlock
-    elif issubclass(vtype, np.timedelta64):
-        assert issubclass(vtype, np.integer)
+
+    elif kind == "M":
+        cls = DatetimeBlock
+    elif kind == "m":
         cls = TimeDeltaBlock
-    elif issubclass(vtype, np.complexfloating):
+    elif kind == "f":
+        cls = FloatBlock
+    elif kind == "c":
         cls = ComplexBlock
-    elif issubclass(vtype, np.integer):
+    elif kind == "i" or kind == "u":
         cls = IntBlock
-    elif dtype == np.bool_:
+    elif kind == "b":
         cls = BoolBlock
     else:
         cls = ObjectBlock
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 06de1972b4c9a..dd3a04ccb38e2 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -70,14 +70,21 @@ def concatenate_block_managers(
             vals = [ju.block.values for ju in join_units]
 
             if not blk.is_extension:
-                values = concat_compat(vals, axis=blk.ndim - 1)
+                # _is_uniform_join_units ensures a single dtype, so
+                # we can use np.concatenate, which is more performant
+                # than concat_compat
+                values = np.concatenate(vals, axis=blk.ndim - 1)
             else:
                 # TODO(EA2D): special-casing not needed with 2D EAs
                 values = concat_compat(vals)
                 if not isinstance(values, ExtensionArray):
                     values = values.reshape(1, len(values))
 
-            b = make_block(values, placement=placement, ndim=blk.ndim)
+            if blk.values.dtype == values.dtype:
+                # Fast-path
+                b = blk.make_block_same_class(values, placement=placement)
+            else:
+                b = make_block(values, placement=placement, ndim=blk.ndim)
         else:
             b = make_block(
                 _concatenate_join_units(join_units, concat_axis, copy=copy),
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 4a2629daf63d7..42b541bd4cb02 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -18,6 +18,7 @@
 import numpy as np
 
 from pandas._typing import FrameOrSeriesUnion, Label
+from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
@@ -360,7 +361,7 @@ def __init__(
         if len(objs) == 0:
             raise ValueError("All objects passed were None")
 
-        # consolidate data & figure out what our result ndim is going to be
+        # figure out what our result ndim is going to be
         ndims = set()
         for obj in objs:
             if not isinstance(obj, (ABCSeries, ABCDataFrame)):
                 msg = (
@@ -370,8 +371,6 @@ def __init__(
                 )
                 raise TypeError(msg)
 
-            # consolidate
-            obj._consolidate_inplace()
             ndims.add(obj.ndim)
 
         # get the sample
@@ -543,7 +542,7 @@ def _get_result_dim(self) -> int:
     def _get_new_axes(self) -> List[Index]:
         ndim = self._get_result_dim()
         return [
-            self._get_concat_axis() if i == self.bm_axis else self._get_comb_axis(i)
+            self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i)
             for i in range(ndim)
         ]
 
@@ -557,6 +556,7 @@ def _get_comb_axis(self, i: int) -> Index:
             copy=self.copy,
         )
 
+    @cache_readonly
     def _get_concat_axis(self) -> Index:
         """
         Return index to be used along concatenation axis.

From fde8d33db1bdf0bb6caff0af95ba35cc6933f3e8 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Thu, 17 Dec 2020 14:45:03 -0800
Subject: [PATCH 16/17] BENCH: Increase sample of
 CategoricalIndexIndexing.time_get_indexer_list benchmark (#38545)

---
 asv_bench/benchmarks/indexing.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 38d1f64bd5f4e..e95e5bec5849c 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -3,6 +3,7 @@
 lower-level methods directly on Index and subclasses, see index_object.py,
 indexing_engine.py, and index_cached.py
 """
+import itertools
 import string
 import warnings
 
@@ -256,7 +257,9 @@ def setup(self, index):
             "non_monotonic": CategoricalIndex(list("abc" * N)),
         }
         self.data = indices[index]
-        self.data_unique = CategoricalIndex(list(string.printable))
+        self.data_unique = CategoricalIndex(
+            ["".join(perm) for perm in itertools.permutations(string.printable, 3)]
+        )
 
         self.int_scalar = 10000
         self.int_list = list(range(10000))
From 54682234e3a3e89e246313bf8f9a53f98b199e7b Mon Sep 17 00:00:00 2001
From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
Date: Fri, 18 Dec 2020 07:40:04 +0800
Subject: [PATCH 17/17] BUG: CategoricalIndex.reindex fails when Index passed
 with labels all in category (#38492)

---
 doc/source/whatsnew/v1.3.0.rst                |  2 +-
 pandas/core/indexes/category.py               |  2 +-
 .../tests/indexes/categorical/test_reindex.py | 34 ++++++++++++++++++-
 3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 990c87eab5a8d..3545dd8a89159 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -170,7 +170,7 @@ Bug fixes
 
 Categorical
 ^^^^^^^^^^^
--
+- Bug in ``CategoricalIndex.reindex`` failing when an ``Index`` was passed with all of its labels in the categories (:issue:`28690`)
 -
 
 Datetimelike
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 588ce0a4931ba..76b1c061cc827 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -428,7 +428,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
 
         if len(missing):
             cats = self.categories.get_indexer(target)
 
-            if (cats == -1).any():
+            if not isinstance(cats, CategoricalIndex) or (cats == -1).any():
                 # coerce to a regular index here!
                 result = Index(np.array(self), name=self.name)
                 new_target, indexer, _ = result._reindex_non_unique(np.array(target))
diff --git a/pandas/tests/indexes/categorical/test_reindex.py b/pandas/tests/indexes/categorical/test_reindex.py
index 668c559abd08e..8228c5139ccdd 100644
--- a/pandas/tests/indexes/categorical/test_reindex.py
+++ b/pandas/tests/indexes/categorical/test_reindex.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-from pandas import Categorical, CategoricalIndex, Index, Series
+from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series
 import pandas._testing as tm
 
 
@@ -59,3 +59,35 @@ def test_reindex_missing_category(self):
         msg = "'fill_value=-1' is not present in this Categorical's categories"
         with pytest.raises(TypeError, match=msg):
             ser.reindex([1, 2, 3, 4, 5], fill_value=-1)
+
+    @pytest.mark.parametrize(
+        "index_df,index_res,index_exp",
+        [
+            (
+                CategoricalIndex([], categories=["A"]),
+                Index(["A"]),
+                Index(["A"]),
+            ),
+            (
+                CategoricalIndex([], categories=["A"]),
+                Index(["B"]),
+                Index(["B"]),
+            ),
+            (
+                CategoricalIndex([], categories=["A"]),
+                CategoricalIndex(["A"]),
+                CategoricalIndex(["A"]),
+            ),
+            (
+                CategoricalIndex([], categories=["A"]),
+                CategoricalIndex(["B"]),
+                CategoricalIndex(["B"]),
+            ),
+        ],
+    )
+    def test_reindex_not_category(self, index_df, index_res, index_exp):
+        # GH: 28690
+        df = DataFrame(index=index_df)
+        result = df.reindex(index=index_res)
+        expected = DataFrame(index=index_exp)
+        tm.assert_frame_equal(result, expected)
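Note - an illustrative before/after for the final patch, mirroring its tests: reindexing a frame indexed by an empty CategoricalIndex with labels that are all valid categories previously failed in the reindex branch guarded by (cats == -1).any(); it now simply returns a frame indexed by the requested labels:

    import pandas as pd

    df = pd.DataFrame(index=pd.CategoricalIndex([], categories=["A"]))
    result = df.reindex(index=pd.Index(["A"]))
    print(result.index)  # Index(['A'], dtype='object')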