ENH: add integer-na support via an ExtensionArray

closes pandas-dev#20700 closes pandas-dev#20747
jreback · Jul 13, 2018 · 4586245 · 4586245
1 parent 365eac4
commit 4586245
Show file tree

Hide file tree

Showing 21 changed files with 1,386 additions and 67 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -8,6 +8,7 @@ v0.24.0 (Month XX, 2018)
 New features
 ~~~~~~~~~~~~
 
+
 - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
 
 .. _whatsnew_0240.enhancements.extension_array_operators:
@@ -26,6 +27,61 @@ See the :ref:`ExtensionArray Operator Support
 <extending.extension.operator>` documentation section for details on both
 ways of adding operator support.
 
+.. _whatsnew_0240.enhancements.intna:
+
+Integer NA Support
+^^^^^^^^^^^^^^^^^^
+
+Pandas has gained the ability to hold integer dtypes with missing values. This long-requested feature is enabled through the use of ``ExtensionTypes``. Here is an example of the usage.
+
+We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value
+marker of ``np.nan`` will be inferred as an integer dtype. The display of the ``Series`` will also use ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`)
+
+.. ipython:: python
+
+   s = pd.Series([1, 2, np.nan], dtype='Int64')
+   s
+
+
+Operations on these dtypes will propagate ``NaN``, as other pandas operations do.
+
+.. ipython:: python
+
+   # arithmetic
+   s + 1
+
+   # comparison
+   s == 1
+
+   # indexing
+   s.iloc[1:3]
+
+   # operate with other dtypes
+   s + s.iloc[1:3]
+
+   # coerce when needed
+   s + 0.01
+
+These dtypes can operate as part of ``DataFrames``.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')})
+   df
+   df.dtypes
+
+
+These dtypes can be merged, reshaped, and cast.
+
+.. ipython:: python
+
+   pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes
+   df['A'].astype(float)
+
+.. warning::
+
+   The Integer NA support currently uses the capitalized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date.
+
 .. _whatsnew_0240.enhancements.read_html:
 
 ``read_html`` Enhancements
@@ -182,6 +238,7 @@ Previous Behavior:
 ExtensionType Changes
 ^^^^^^^^^^^^^^^^^^^^^
 
+- ``ExtensionArray`` has gained the abstract method ``.dropna()`` (:issue:`21185`)
 - ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore
   the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`)
 - The ``ExtensionArray`` constructor, ``_from_sequence``, now takes the keyword arg ``copy=False`` (:issue:`21185`)

diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
@@ -1,6 +1,9 @@
 from .base import (ExtensionArray,    # noqa
+                   ExtensionOpsMixin,
                    ExtensionScalarOpsMixin)
 from .categorical import Categorical  # noqa
 from .datetimes import DatetimeArrayMixin  # noqa
 from .period import PeriodArrayMixin  # noqa
 from .timedelta import TimedeltaArrayMixin  # noqa
+from .integer import (  # noqa
+    IntegerArray, to_integer_array)
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -12,8 +12,8 @@
 from pandas.errors import AbstractMethodError
 from pandas.compat.numpy import function as nv
 from pandas.compat import set_function_name, PY3
-from pandas.core.dtypes.common import is_list_like
 from pandas.core import ops
+from pandas.core.dtypes.common import is_list_like
 
 _not_implemented_message = "{} does not implement {}."
 
@@ -88,14 +88,16 @@ class ExtensionArray(object):
     # Constructors
     # ------------------------------------------------------------------------
     @classmethod
-    def _from_sequence(cls, scalars, copy=False):
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
         """Construct a new ExtensionArray from a sequence of scalars.
 
         Parameters
         ----------
         scalars : Sequence
             Each element will be an instance of the scalar type for this
             array, ``cls.dtype.type``.
+        dtype : Dtype, optional
+            construct for this particular dtype
         copy : boolean, default False
             if True, copy the underlying data
         Returns
@@ -378,7 +380,7 @@ def fillna(self, value=None, method=None, limit=None):
                 func = pad_1d if method == 'pad' else backfill_1d
                 new_values = func(self.astype(object), limit=limit,
                                   mask=mask)
-                new_values = self._from_sequence(new_values)
+                new_values = self._from_sequence(new_values, dtype=self.dtype)
             else:
                 # fill with value
                 new_values = self.copy()
@@ -407,7 +409,7 @@ def unique(self):
         from pandas import unique
 
         uniques = unique(self.astype(object))
-        return self._from_sequence(uniques)
+        return self._from_sequence(uniques, dtype=self.dtype)
 
     def _values_for_factorize(self):
         # type: () -> Tuple[ndarray, Any]
@@ -559,7 +561,7 @@ def take(self, indices, allow_fill=False, fill_value=None):
 
                result = take(data, indices, fill_value=fill_value,
                              allow_fill=allow_fill)
-               return self._from_sequence(result)
+               return self._from_sequence(result, dtype=self.dtype)
         """
         # Implementer note: The `fill_value` parameter should be a user-facing
         # value, an instance of self.dtype.type. When passed `fill_value=None`,

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -487,8 +487,8 @@ def _constructor(self):
         return Categorical
 
     @classmethod
-    def _from_sequence(cls, scalars):
-        return Categorical(scalars)
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
+        return Categorical(scalars, dtype=dtype)
 
     def copy(self):
         """ Copy constructor. """