Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a validation check to rename() #202

Merged
merged 7 commits into from
Feb 27, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

# Next Release

- [#202](https://github.com/IAMconsortium/pyam/pull/202) Extend the `df.rename()` function with a `check_duplicates (default True)` validation option
- [#199](https://github.com/IAMconsortium/pyam/pull/199) Initializing an `IamDataFrame` accepts kwargs to fill or create from the data any missing required columns
- [#197](https://github.com/IAMconsortium/pyam/pull/197) Added a `normalize` function that normalizes all data in a data frame to a specific time period.
- [#195](https://github.com/IAMconsortium/pyam/pull/195) Fix filtering for `time`, `day` and `hour` to use generic `pattern_match()` (if such a column exists) in 'year'-formatted IamDataFrames
Expand Down
38 changes: 31 additions & 7 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,8 @@ def validate(self, criteria={}, exclude_on_fail=False):

return df

def rename(self, mapping=None, inplace=False, append=False, **kwargs):
def rename(self, mapping=None, inplace=False, append=False,
check_duplicates=True, **kwargs):
"""Rename and aggregate column entries using `groupby.sum()` on values.
When renaming models or scenarios, the uniqueness of the index must be
maintained, and the function will raise an error otherwise.
Expand All @@ -551,6 +552,10 @@ def rename(self, mapping=None, inplace=False, append=False, **kwargs):
if True, do operation inplace and return None
append: bool, default False
if True, append renamed timeseries to IamDataFrame
check_duplicates: bool, default True
check whether conflict between existing and renamed data exists.
If True, raise ValueError; if False, rename and merge
with `groupby().sum()`.
"""
# combine `mapping` arg and mapping kwargs, ensure no rename conflicts
mapping = mapping or {}
Expand All @@ -560,12 +565,16 @@ def rename(self, mapping=None, inplace=False, append=False, **kwargs):
raise ValueError(msg)
mapping.update(kwargs)

# determine columns that are not `model` or `scenario`
data_cols = set(self._LONG_IDX) - set(META_IDX)

# changing index and data columns can cause model-scenario mismatch
if any(i in mapping for i in META_IDX)\
and any(i in mapping for i in ['region', 'variable', 'unit']):
and any(i in mapping for i in data_cols):
msg = 'Renaming index and data cols simultaneously not supported!'
raise ValueError(msg)

# translate rename mapping to `filter()` arguments
filters = {col: _from.keys() for col, _from in mapping.items()}

# if append is True, downselect and append renamed data
Expand All @@ -581,6 +590,9 @@ def rename(self, mapping=None, inplace=False, append=False, **kwargs):
rows = ret._apply_filters(filters)
idx = ret.meta.index.isin(_make_index(ret.data[rows]))

# if `check_duplicates`, do the rename on a copy until after the check
_data = ret.data.copy() if check_duplicates else ret.data

# apply renaming changes
for col, _mapping in mapping.items():
if col in META_IDX:
Expand All @@ -590,11 +602,23 @@ def rename(self, mapping=None, inplace=False, append=False, **kwargs):
raise ValueError('Renaming to non-unique `{}` index!'
.format(col))
ret.meta.index = _index.set_index(META_IDX).index
elif col not in ['region', 'variable', 'unit']:
elif col not in data_cols:
raise ValueError('Renaming by `{}` not supported!'.format(col))
ret.data.loc[rows, col] = ret.data.loc[rows, col].replace(_mapping)
_data.loc[rows, col] = _data.loc[rows, col].replace(_mapping)

# check if duplicates exist between the renamed and not-renamed data
if check_duplicates:
merged = (
_data.loc[rows, self._LONG_IDX].drop_duplicates().append(
_data.loc[~rows, self._LONG_IDX].drop_duplicates())
)
if any(merged.duplicated()):
msg = 'Duplicated rows between original and renamed data!\n{}'
conflict_rows = merged.loc[merged.duplicated(), self._LONG_IDX]
raise ValueError(msg.format(conflict_rows.drop_duplicates()))

ret.data = ret.data.groupby(ret._LONG_IDX).sum().reset_index()
# merge using `groupby().sum()`
ret.data = _data.groupby(ret._LONG_IDX).sum().reset_index()

if not inplace:
return ret
Expand All @@ -621,8 +645,8 @@ def convert_unit(self, conversion_mapping, inplace=False):
return ret

def normalize(self, inplace=False, **kwargs):
"""Normalize data to a given value. Currently only supports normalizing to a
specific time
"""Normalize data to a given value. Currently only supports normalizing
to a specific time.

Parameters
----------
Expand Down
20 changes: 19 additions & 1 deletion tests/test_feature_append_rename_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from numpy import testing as npt


from pyam import IamDataFrame, META_IDX, IAMC_IDX
from pyam import IamDataFrame, META_IDX, IAMC_IDX, compare


RENAME_DF = IamDataFrame(pd.DataFrame([
Expand Down Expand Up @@ -181,6 +181,24 @@ def test_rename_append(meta_df):
pd.testing.assert_frame_equal(obs.meta, exp)


def test_rename_duplicates():
    """Renaming onto an existing index raises unless duplicate-merging is enabled."""
    mapping = {'variable': {'test_1': 'test_3'}}

    # with the default `check_duplicates=True`, the conflicting rename is rejected
    with pytest.raises(ValueError):
        RENAME_DF.rename(**mapping)

    # opting out of the check merges the overlapping rows via `groupby().sum()`
    obs = RENAME_DF.rename(check_duplicates=False, **mapping)

    exp_rows = [
        ['model', 'scen', 'region_a', 'test_2', 'unit', 2, 6],
        ['model', 'scen', 'region_a', 'test_3', 'unit', 4, 12],
        ['model', 'scen', 'region_b', 'test_3', 'unit', 4, 8],
    ]
    exp_cols = ['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010]
    exp = IamDataFrame(pd.DataFrame(exp_rows, columns=exp_cols))

    assert compare(obs, exp).empty
    pd.testing.assert_frame_equal(obs.data, exp.data)


def test_convert_unit():
df = IamDataFrame(pd.DataFrame([
['model', 'scen', 'SST', 'test_1', 'A', 1, 5],
Expand Down