pandas-dev · jreback · Sep 3, 2020 · Jun 7, 2020 · Jun 8, 2020 · Jun 8, 2020
diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst
@@ -0,0 +1,172 @@
+.. _duplicates:
+
+****************
+Duplicate Labels
+****************
+
+:class:`Index` objects are not required to be unique; you can have duplicate row
+or column labels. This may be a bit confusing at first. If you're familiar with
+SQL, you know that row labels are similar to a primary key on a table, and you
+would never want duplicates in a SQL table. But one of pandas' roles is to clean
+messy, real-world data before it goes to some downstream system. And real-world
+data has duplicates, even in fields that are supposed to be unique.
+
+This section describes how duplicate labels change the behavior of certain
+operations, and how to prevent duplicates from arising during operations, or
+to detect them if they do.
+
+.. ipython:: python
+
+   import pandas as pd
+   import numpy as np
+
+Consequences of Duplicate Labels
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some pandas methods (:meth:`Series.reindex` for example) just don't work with
+duplicates present. The output can't be determined, and so pandas raises.
+
+.. ipython:: python
+   :okexcept:
+
+   s1 = pd.Series([0, 1, 2], index=['a', 'b', 'b'])
+   s1.reindex(['a', 'b', 'c'])
+
+Other methods, like indexing, can give very surprising results. Typically
+indexing with a scalar will *reduce dimensionality*. Slicing a ``DataFrame``
+with a scalar will return a ``Series``. Slicing a ``Series`` with a scalar will
+return a scalar. But with duplicates, this isn't the case.
+
+.. ipython:: python
+
+   df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'A', 'B'])
+   df1
+
+We have duplicates in the columns. If we slice ``'B'``, we get back a ``Series``
+
+.. ipython:: python
+
+   df1['B']  # a series
+
+But slicing ``'A'`` returns a ``DataFrame``
+
+
+.. ipython:: python
+
+   df1['A']  # a DataFrame
+
+This applies to row labels as well
+
+.. ipython:: python
+
+   df2 = pd.DataFrame({"A": [0, 1, 2]}, index=['a', 'a', 'b'])
+   df2
+   df2.loc['b', 'A']  # a scalar
+   df2.loc['a', 'A']  # a Series
+
+Duplicate Label Detection
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can check whether an :class:`Index` (storing the row or column labels) is
+unique with :attr:`Index.is_unique`:
+
+.. ipython:: python
+
+   df2
+   df2.index.is_unique
+   df2.columns.is_unique
+
+.. note::
+
+   Checking whether an index is unique is somewhat expensive for large datasets.
+   Pandas does cache this result, so re-checking on the same index is very fast.
+
+:meth:`Index.duplicated` will return a boolean ndarray indicating whether a
+label is a repeat.
+
+.. ipython:: python
+
+   df2.index.duplicated()
+
+The result can be used as a boolean filter to drop rows with repeated labels.
+
+.. ipython:: python
+
+   df2.loc[~df2.index.duplicated(), :]
+
+If you need additional logic to handle duplicate labels, rather than just
+dropping the repeats, using :meth:`~DataFrame.groupby` on the index is a common
+trick. For example, we'll resolve duplicates by taking the average of all rows
+with the same label.
+
+.. ipython:: python
+
+   df2.groupby(level=0).mean()
+
+.. _duplicates.disallow:
+
+Disallowing Duplicate Labels
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 1.1.0
+
+As noted above, handling duplicates is an important feature when reading in raw
+data. That said, you may want to avoid introducing duplicates as part of a data
+processing pipeline (from methods like :meth:`pandas.concat`,
+:meth:`~DataFrame.rename`, etc.). Both :class:`Series` and :class:`DataFrame`
+can be created with the argument ``allows_duplicate_labels=False`` to *disallow*
+duplicate labels (the default is to allow them). If there are duplicate labels,
+an exception will be raised.
+
+.. ipython:: python
+   :okexcept:
+
+   pd.Series([0, 1, 2], index=['a', 'b', 'b'], allows_duplicate_labels=False)
+
+This applies to both row and column labels for a :class:`DataFrame`
+
+.. ipython:: python
+   :okexcept:
+
+   pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "A", "B"],
+                allows_duplicate_labels=False)
+
+This attribute can be checked or set with :attr:`~DataFrame.allows_duplicate_labels`,
+which indicates whether that object can have duplicate labels.
+
+.. ipython:: python
+
+   df = pd.DataFrame({"A": [0, 1, 2, 3]},
+                     index=['x', 'y', 'X', 'Y'],
+                     allows_duplicate_labels=False)
+   df
+   df.allows_duplicate_labels
+
+Performing an operation that introduces duplicate labels on a ``Series`` or
+``DataFrame`` that disallows duplicates will raise an
+:class:`errors.DuplicateLabelError`.
+
+.. ipython:: python
+   :okexcept:
+
+   df.rename(str.upper)
+
+Duplicate Label Propagation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In general, disallowing duplicates is "sticky". It's preserved through
+operations.
+
+.. ipython:: python
+   :okexcept:
+
+   s1 = pd.Series(0, index=['a', 'b'], allows_duplicate_labels=False)
+   s1
+   s1.head().rename({"a": "b"})
+
+.. warning::
+
+   This is an experimental feature. Currently, many methods fail to
+   propagate the ``allows_duplicate_labels`` value. In future versions
+   it is expected that every method taking or returning one or more
+   DataFrame or Series objects will propagate ``allows_duplicate_labels``.
diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst
@@ -33,6 +33,7 @@ Further information on any specific method can be obtained in the
     reshaping
     text
     missing_data
+    duplicates
     categorical
     integer_na
     boolean

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -203,6 +203,45 @@ For example:
     pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z', utc=True)
     pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z')
 
+.. _whatsnew_110.duplicate_labels:
+
+Optionally disallow duplicate labels
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`Series` and :class:`DataFrame` can now be created with the ``allows_duplicate_labels`` flag, which
+controls whether the index or columns can contain duplicate labels. Passing ``False`` can be used to prevent
+accidental introduction of duplicate labels, which can affect downstream operations.
+
+By default, duplicates continue to be allowed
+
+.. ipython:: python
+
+   pd.Series([1, 2], index=['a', 'a'])
+
+.. ipython:: python
+   :okexcept:
+
+   pd.Series([1, 2], index=['a', 'a'], allows_duplicate_labels=False)
+
+Pandas will propagate the ``allows_duplicate_labels`` property through many operations.
+
+.. ipython:: python
+   :okexcept:
+
+   a = pd.Series([1, 2], index=['a', 'b'], allows_duplicate_labels=False)
+   a
+   # An operation introducing duplicates
+   a.reindex(['a', 'b', 'a'])
+
+.. warning::
+
+   This is an experimental feature. Currently, many methods fail to
+   propagate the ``allows_duplicate_labels`` value. In future versions
+   it is expected that every method taking or returning one or more
+   DataFrame or Series objects will propagate ``allows_duplicate_labels``.
+
+See :ref:`duplicates` for more.
+
 .. _whatsnew_110.grouper_resample_origin:
 
 Grouper and resample now supports the arguments origin and offset

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -360,6 +360,22 @@ class DataFrame(NDFrame):
         Data type to force. Only a single dtype is allowed. If None, infer.
     copy : bool, default False
         Copy data from inputs. Only affects DataFrame / 2d ndarray input.
+    allows_duplicate_labels : bool, default True
+        Whether to allow duplicate row or column labels in this DataFrame.
+        By default, duplicate labels are permitted. Setting this to ``False``
+        will cause an :class:`errors.DuplicateLabelError` to be raised when
+        `index` or `columns` are not unique, or when any subsequent operation
+        on this DataFrame introduces duplicates. See :ref:`duplicates.disallow`
+        for more.
+
+        .. versionadded:: 1.1.0
+
+        .. warning::
+
+           This is an experimental feature. Currently, many methods fail to
+           propagate the ``allows_duplicate_labels`` value. In future versions
+           it is expected that every method taking or returning one or more
+           DataFrame or Series objects will propagate ``allows_duplicate_labels``.
 
     See Also
     --------
@@ -436,6 +452,7 @@ def __init__(
         columns: Optional[Axes] = None,
         dtype: Optional[Dtype] = None,
         copy: bool = False,
+        allows_duplicate_labels=True,
     ):
         if data is None:
             data = {}
@@ -448,7 +465,9 @@ def __init__(
         if isinstance(data, BlockManager):
             if index is None and columns is None and dtype is None and copy is False:
                 # GH#33357 fastpath
-                NDFrame.__init__(self, data)
+                NDFrame.__init__(
+                    self, data, allows_duplicate_labels=allows_duplicate_labels
+                )
                 return
 
             mgr = self._init_mgr(
@@ -534,7 +553,7 @@ def __init__(
             else:
                 raise ValueError("DataFrame constructor not properly called!")
 
-        NDFrame.__init__(self, mgr)
+        NDFrame.__init__(self, mgr, allows_duplicate_labels=allows_duplicate_labels)
 
     # ----------------------------------------------------------------------
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -196,6 +196,7 @@ def __init__(
         self,
         data: BlockManager,
         copy: bool = False,
+        allows_duplicate_labels: bool = True,
         attrs: Optional[Mapping[Optional[Hashable], Any]] = None,
     ):
         # copy kwarg is retained for mypy compat, is not used
@@ -208,6 +209,7 @@ def __init__(
         else:
             attrs = dict(attrs)
         object.__setattr__(self, "_attrs", attrs)
+        object.__setattr__(self, "allows_duplicate_labels", allows_duplicate_labels)
 
     @classmethod
     def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager:
@@ -246,6 +248,23 @@ def attrs(self) -> Dict[Optional[Hashable], Any]:
     def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None:
         self._attrs = dict(value)
 
+    @property
+    def allows_duplicate_labels(self) -> bool:
+        """Whether this object may carry duplicate labels on any axis."""
+        return self._allows_duplicate_labels
+
+    @allows_duplicate_labels.setter
+    def allows_duplicate_labels(self, value: bool):
+        # Coerce first so truthy/falsy inputs are stored as a real bool.
+        allowed = bool(value)
+        if not allowed:
+            # Disallowing duplicates: validate every axis before storing the
+            # flag, so a failed check leaves the stored value untouched.
+            for axis in self.axes:
+                axis._maybe_check_unique()
+        # Use object.__setattr__ to avoid the `can_hold_identifiers` check.
+        object.__setattr__(self, "_allows_duplicate_labels", allowed)
+
     @classmethod
     def _validate_dtype(cls, dtype):
         """ validate the passed dtype """
@@ -1841,6 +1860,10 @@ def __setstate__(self, state):
             if typ is not None:
                 attrs = state.get("_attrs", {})
                 object.__setattr__(self, "_attrs", attrs)
+                allows_duplicate_labels = state.get("_allows_duplicate_labels", True)
+                object.__setattr__(
+                    self, "_allows_duplicate_labels", allows_duplicate_labels
+                )
 
                 # set in the order of internal names
                 # to avoid definitional recursion
@@ -5205,10 +5228,17 @@ def __finalize__(
         if isinstance(other, NDFrame):
             for name in other.attrs:
                 self.attrs[name] = other.attrs[name]
+
+            self.allows_duplicate_labels = other.allows_duplicate_labels
             # For subclasses using _metadata.
             for name in self._metadata:
                 assert isinstance(name, str)
                 object.__setattr__(self, name, getattr(other, name, None))
+
+        if method == "concat":
+            allows_duplicate_labels = all(x.allows_duplicate_labels for x in other.objs)
+            self.allows_duplicate_labels = allows_duplicate_labels
+
         return self
 
     def __getattr__(self, name: str):

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -473,6 +473,26 @@ def _simple_new(cls, values, name: Label = None):
     def _constructor(self):
         return type(self)
 
+    def _maybe_check_unique(self):
+        """Raise ``DuplicateLabelError`` if this index has repeated labels."""
+        from pandas.errors import DuplicateLabelError
+
+        if self.is_unique:
+            return
+
+        # TODO: position, value, not too large.
+        table = self._format_duplicate_message()
+        raise DuplicateLabelError("Index has duplicates." + "\n{}".format(table))
+
+    def _format_duplicate_message(self):
+        """Build a frame mapping each repeated label to all its positions."""
+        from pandas import Series
+
+        repeated = self[self.duplicated(keep="first")].unique()
+        assert len(repeated)  # expected non-empty when the index is not unique
+        positions = Series(np.arange(len(self))).groupby(self).agg(list)
+        return positions[repeated].rename_axis("label").to_frame(name="positions")
+
     # --------------------------------------------------------------------
     # Index Internals Methods