Commit 6e9882d: fetch from master and merge

suyashgupta01 committed Aug 30, 2021
2 parents 101c0c2 + 40faba2

Showing 70 changed files with 1,023 additions and 263 deletions.
10 changes: 9 additions & 1 deletion asv_bench/benchmarks/inference.py
@@ -115,19 +115,27 @@ def time_maybe_convert_objects(self):
class ToDatetimeFromIntsFloats:
    def setup(self):
        self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64")
+        self.ts_sec_uint = Series(range(1521080307, 1521685107), dtype="uint64")
        self.ts_sec_float = self.ts_sec.astype("float64")

        self.ts_nanosec = 1_000_000 * self.ts_sec
+        self.ts_nanosec_uint = 1_000_000 * self.ts_sec_uint
        self.ts_nanosec_float = self.ts_nanosec.astype("float64")

-    # speed of int64 and float64 paths should be comparable
+    # speed of int64, uint64 and float64 paths should be comparable

    def time_nanosec_int64(self):
        to_datetime(self.ts_nanosec, unit="ns")

+    def time_nanosec_uint64(self):
+        to_datetime(self.ts_nanosec_uint, unit="ns")
+
    def time_nanosec_float64(self):
        to_datetime(self.ts_nanosec_float, unit="ns")

+    def time_sec_uint64(self):
+        to_datetime(self.ts_sec_uint, unit="s")
+
    def time_sec_int64(self):
        to_datetime(self.ts_sec, unit="s")

4 changes: 2 additions & 2 deletions asv_bench/benchmarks/io/csv.py
@@ -206,7 +206,7 @@ def time_read_csv(self, bad_date_value):
class ReadCSVSkipRows(BaseIO):

    fname = "__test__.csv"
-    params = ([None, 10000], ["c", "python"])
+    params = ([None, 10000], ["c", "python", "pyarrow"])
    param_names = ["skiprows", "engine"]

    def setup(self, skiprows, engine):
@@ -320,7 +320,7 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):


class ReadCSVEngine(StringIORewind):
-    params = ["c", "python"]
+    params = ["c", "python", "pyarrow"]
    param_names = ["engine"]

    def setup(self, engine):
1 change: 0 additions & 1 deletion ci/deps/actions-39.yaml
@@ -31,7 +31,6 @@ dependencies:
  - python-dateutil
  - pytz
  - s3fs>=0.4.2
-  - aiobotocore<=1.3.3
  - scipy
  - sqlalchemy
  - xlrd
1 change: 0 additions & 1 deletion ci/deps/azure-windows-38.yaml
@@ -30,7 +30,6 @@ dependencies:
  - python-dateutil
  - pytz
  - s3fs>=0.4.0
-  - aiobotocore<=1.3.3
  - scipy
  - xlrd
  - xlsxwriter
1 change: 0 additions & 1 deletion ci/deps/azure-windows-39.yaml
@@ -32,7 +32,6 @@ dependencies:
  - python-dateutil
  - pytz
  - s3fs>=0.4.2
-  - aiobotocore<=1.3.3
  - scipy
  - sqlalchemy
  - xlrd
3 changes: 3 additions & 0 deletions doc/source/getting_started/install.rst
@@ -132,6 +132,9 @@ Installing from PyPI
pandas can be installed via pip from
`PyPI <https://pypi.org/project/pandas>`__.

+.. note::
+    You must have ``pip>=19.3`` to install from PyPI.
+
::

    pip install pandas
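
A quick sketch of satisfying that pip requirement before installing (standard pip invocations, shown only as an example):

::

    python -m pip install --upgrade pip
    python -m pip install pandas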
54 changes: 46 additions & 8 deletions doc/source/user_guide/io.rst
@@ -160,9 +160,15 @@ dtype : Type name or dict of column -> type, default ``None``
    (unsupported with ``engine='python'``). Use ``str`` or ``object`` together
    with suitable ``na_values`` settings to preserve and
    not interpret dtype.
-engine : {``'c'``, ``'python'``}
-    Parser engine to use. The C engine is faster while the Python engine is
-    currently more feature-complete.
+engine : {``'c'``, ``'python'``, ``'pyarrow'``}
+    Parser engine to use. The C and pyarrow engines are faster, while the python engine
+    is currently more feature-complete. Multithreading is currently only supported by
+    the pyarrow engine.
+
+    .. versionadded:: 1.4.0
+
+        The "pyarrow" engine was added as an *experimental* engine, and some features
+        are unsupported, or may not work correctly, with this engine.
converters : dict, default ``None``
    Dict of functions for converting values in certain columns. Keys can either be
    integers or column labels.
@@ -1622,11 +1628,17 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object:
Specifying the parser engine
''''''''''''''''''''''''''''

-Under the hood pandas uses a fast and efficient parser implemented in C as well
-as a Python implementation which is currently more feature-complete. Where
-possible pandas uses the C parser (specified as ``engine='c'``), but may fall
-back to Python if C-unsupported options are specified. Currently, C-unsupported
-options include:
+Pandas currently supports three engines: the C engine, the python engine, and the
+experimental pyarrow engine (requires the ``pyarrow`` package). In general, the pyarrow
+engine is fastest on larger workloads and is equivalent in speed to the C engine on most
+other workloads. The python engine tends to be slower than the pyarrow and C engines on
+most workloads. However, the pyarrow engine is much less robust than the C engine, which
+in turn lacks a few features present in the Python engine.
+
+Where possible, pandas uses the C parser (specified as ``engine='c'``), but it may fall
+back to Python if C-unsupported options are specified.
+
+Currently, options unsupported by the C and pyarrow engines include:

* ``sep`` other than a single character (e.g. regex separators)
* ``skipfooter``
@@ -1635,6 +1647,32 @@ options include:
Specifying any of the above options will produce a ``ParserWarning`` unless the
python engine is selected explicitly using ``engine='python'``.

+Options unsupported by the pyarrow engine that are not covered by the list above include:
+
+* ``float_precision``
+* ``chunksize``
+* ``comment``
+* ``nrows``
+* ``thousands``
+* ``memory_map``
+* ``dialect``
+* ``warn_bad_lines``
+* ``error_bad_lines``
+* ``on_bad_lines``
+* ``delim_whitespace``
+* ``quoting``
+* ``lineterminator``
+* ``converters``
+* ``decimal``
+* ``iterator``
+* ``dayfirst``
+* ``infer_datetime_format``
+* ``verbose``
+* ``skipinitialspace``
+* ``low_memory``
+
+Specifying these options with ``engine='pyarrow'`` will raise a ``ValueError``.
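
As a minimal sketch of the engine selection described above (the file name is hypothetical, and ``engine="pyarrow"`` assumes pyarrow is installed):

::

    import pandas as pd

    # The C engine is the default where possible; "pyarrow" opts into the
    # experimental multithreaded parser.
    df = pd.read_csv("data.csv", engine="pyarrow")

    # Per the list above, combining the pyarrow engine with an unsupported
    # option raises ValueError:
    # pd.read_csv("data.csv", engine="pyarrow", chunksize=1000)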

.. _io.remote:

Reading/writing remote files
12 changes: 11 additions & 1 deletion doc/source/whatsnew/v1.3.3.rst
@@ -17,6 +17,16 @@ Fixed regressions
- Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`)
- Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`)
- Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`)
+- Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`)

+.. ---------------------------------------------------------------------------
+.. _whatsnew_133.performance:
+
+Performance improvements
+~~~~~~~~~~~~~~~~~~~~~~~~
+- Performance improvement for :meth:`DataFrame.__setitem__` when the key or value is not a :class:`DataFrame`, or key is not list-like (:issue:`43274`)
+-
+-

.. ---------------------------------------------------------------------------
@@ -25,7 +35,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
--
+- Bug in :meth:`.DataFrameGroupBy.agg` and :meth:`.DataFrameGroupBy.transform` with ``engine="numba"`` where ``index`` data was not being correctly passed into ``func`` (:issue:`43133`)
-

.. ---------------------------------------------------------------------------
12 changes: 8 additions & 4 deletions doc/source/whatsnew/v1.4.0.rst
@@ -78,10 +78,13 @@ Styler

There are also bug fixes and deprecations listed below.

-.. _whatsnew_140.enhancements.enhancement2:
+.. _whatsnew_140.enhancements.pyarrow_csv_engine:

-enhancement2
-^^^^^^^^^^^^
+Multithreaded CSV reading with a new CSV Engine based on pyarrow
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0), allowing for faster csv parsing on multicore machines
+with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)
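
A rough way to see the effect on a multicore machine (a sketch only: ``big.csv`` is a hypothetical large file, and timings vary by workload):

::

    import time

    import pandas as pd

    for engine in ("c", "python", "pyarrow"):
        start = time.perf_counter()
        pd.read_csv("big.csv", engine=engine)
        print(engine, time.perf_counter() - start)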

.. _whatsnew_140.enhancements.other:

@@ -242,7 +245,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`)
- Performance improvement in some :meth:`GroupBy.apply` operations (:issue:`42992`)
- Performance improvement in :func:`read_stata` (:issue:`43059`)
--
+- Performance improvement in :meth:`to_datetime` with ``uint`` dtypes (:issue:`42606`)

.. ---------------------------------------------------------------------------
@@ -322,6 +325,7 @@ MultiIndex
- Bug in :meth:`MultiIndex.get_loc` where the first level is a :class:`DatetimeIndex` and a string key is passed (:issue:`42465`)
- Bug in :meth:`MultiIndex.reindex` when passing a ``level`` that corresponds to an ``ExtensionDtype`` level (:issue:`42043`)
- Bug in :meth:`MultiIndex.get_loc` raising ``TypeError`` instead of ``KeyError`` on nested tuple (:issue:`42440`)
+- Bug in :meth:`MultiIndex.putmask` where the other value was also a :class:`MultiIndex` (:issue:`43212`)
-

I/O
2 changes: 1 addition & 1 deletion environment.yml
@@ -105,7 +105,7 @@ dependencies:

  - pytables>=3.6.1  # pandas.read_hdf, DataFrame.to_hdf
  - s3fs>=0.4.0  # file IO when using 's3://...' path
-  - aiobotocore<=1.3.3  # Remove when s3fs is at 2021.08.0
+  - aiobotocore
  - fsspec>=0.7.4, <2021.6.0  # for generic remote file operations
  - gcsfs>=0.6.0  # file IO when using 'gcs://...' path
  - sqlalchemy  # pandas.read_sql, DataFrame.to_sql
4 changes: 2 additions & 2 deletions pandas/_libs/tslib.pyx
@@ -248,7 +248,7 @@ def array_with_unit_to_datetime(
    # if we have nulls that are not type-compat
    # then need to iterate

-    if values.dtype.kind == "i" or values.dtype.kind == "f":
+    if values.dtype.kind in ["i", "f", "u"]:
        iresult = values.astype("i8", copy=False)
        # fill missing values by comparing to NPY_NAT
        mask = iresult == NPY_NAT
@@ -263,7 +263,7 @@
        ):
            raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")

-        if values.dtype.kind == "i":
+        if values.dtype.kind in ["i", "u"]:
            result = (iresult * m).astype("M8[ns]")

        elif values.dtype.kind == "f":
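
At the user level, this change means unsigned-integer input now takes the same fast conversion path as ``int64``. A small sketch with illustrative values:

::

    import pandas as pd

    s = pd.Series([1521080307, 1521080308], dtype="uint64")
    pd.to_datetime(s, unit="s")  # uint64 now handled like the int64 fast path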
48 changes: 27 additions & 21 deletions pandas/core/frame.py
Expand Up @@ -1653,6 +1653,8 @@ def to_numpy(
               [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
        """
        self._consolidate_inplace()
+        if dtype is not None:
+            dtype = np.dtype(dtype)
        result = self._mgr.as_array(
            transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value
        )
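
The two added lines normalize ``dtype`` with ``np.dtype`` before it reaches the block manager, so string and type aliases behave consistently. A small illustrative sketch:

::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.5, 4.5]})
    arr = df.to_numpy(dtype="float32")  # "float32" is coerced via np.dtype(...)
    assert arr.dtype == np.dtype("float32")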
@@ -3620,9 +3622,11 @@ def __setitem__(self, key, value):
            self._setitem_array(key, value)
        elif isinstance(value, DataFrame):
            self._set_item_frame_value(key, value)
-        elif is_list_like(value) and 1 < len(
-            self.columns.get_indexer_for([key])
-        ) == len(value):
+        elif (
+            is_list_like(value)
+            and not self.columns.is_unique
+            and 1 < len(self.columns.get_indexer_for([key])) == len(value)
+        ):
            # Column to set is duplicated
            self._setitem_array([key], value)
        else:
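
The added ``not self.columns.is_unique`` check lets frames with unique columns skip the potentially costly ``get_indexer_for`` lookup; the duplicated-column branch only matters in cases like this sketch (illustrative data):

::

    import pandas as pd

    df = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "a"])
    df["a"] = [5, 6]  # key matches two columns: duplicated-column branch

    df2 = pd.DataFrame({"x": [1, 2]})
    df2["x"] = [5, 6]  # unique columns: the new check short-circuits the elif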
@@ -9824,26 +9828,28 @@ def _reduce(
        assert filter_type is None or filter_type == "bool", filter_type
        out_dtype = "bool" if filter_type == "bool" else None

-        own_dtypes = [arr.dtype for arr in self._iter_column_arrays()]
+        if numeric_only is None and name in ["mean", "median"]:
+            own_dtypes = [arr.dtype for arr in self._mgr.arrays]

-        dtype_is_dt = np.array(
-            [is_datetime64_any_dtype(dtype) for dtype in own_dtypes],
-            dtype=bool,
-        )
-        if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any():
-            warnings.warn(
-                "DataFrame.mean and DataFrame.median with numeric_only=None "
-                "will include datetime64 and datetime64tz columns in a "
-                "future version.",
-                FutureWarning,
-                stacklevel=5,
-            )
-            # Non-copy equivalent to
-            #  cols = self.columns[~dtype_is_dt]
-            #  self = self[cols]
-            predicate = lambda x: not is_datetime64_any_dtype(x.dtype)
-            mgr = self._mgr._get_data_subset(predicate)
-            self = type(self)(mgr)
+            dtype_is_dt = np.array(
+                [is_datetime64_any_dtype(dtype) for dtype in own_dtypes],
+                dtype=bool,
+            )
+            if dtype_is_dt.any():
+                warnings.warn(
+                    "DataFrame.mean and DataFrame.median with numeric_only=None "
+                    "will include datetime64 and datetime64tz columns in a "
+                    "future version.",
+                    FutureWarning,
+                    stacklevel=5,
+                )
+                # Non-copy equivalent to
+                # dt64_cols = self.dtypes.apply(is_datetime64_any_dtype)
+                # cols = self.columns[~dt64_cols]
+                # self = self[cols]
+                predicate = lambda x: not is_datetime64_any_dtype(x.dtype)
+                mgr = self._mgr._get_data_subset(predicate)
+                self = type(self)(mgr)

        # TODO: Make other agg func handle axis=None properly GH#21597
        axis = self._get_axis_number(axis)
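
The restructuring computes ``own_dtypes`` only when it can matter; the user-visible behavior is unchanged: with a datetime column present, a reduction such as ``mean`` still warns and then drops that column, as in this sketch:

::

    import pandas as pd

    df = pd.DataFrame(
        {"x": [1.0, 2.0], "t": pd.to_datetime(["2021-01-01", "2021-01-02"])}
    )
    df.mean()  # FutureWarning; "t" is excluded from the result for now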