From fcb81fe0fbd908a0ae64e70b8ff684c8160de617 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 22 Feb 2019 09:24:21 +0100 Subject: [PATCH] Initialize an `IamDataFrame` from `pd.DataFrame` with formatting specs (#199) * initial impl for giving default pyam dataframes * move again into core * small fixes * update concat to try to cast to dataframe * appease stickler * fix `TypeError` in `concat()` * add `concat_with_pipe()` in `utils` with some tests * cast elements in `concat_with_pipe()` to `str` * drop `nan` instead of casting to `str` * insert function `reduce_hierarchy()` and add tests * raise error if initializing `IamDataFrame` with duplicate rows in `data` * remove unused legacy function * add auxiliary function `sort_data()` for consistent ordering of cols * use `sort_data()` in `append()` * add `sort_data()` to some unit tests to make them pass * refactor and add first test for `df_to_pyam()` * clean-up before stickler gives me an earful... * clean-up of implementation of `df_to_pyam()` * update docstrings * allow input-df with columns `year` and `time` * defining `defaults` doesn't seem to be necessary * require `value` arg in `df_to_pyam()` instead of using all columns * check for conflicts with existing columns, add test * add option in `df_to_pyam()` to rename cols * add feature to concat required column from multiple given columns * make `cast_to_iam` part of `format_data()` * rename function to `read_file()` because it can only read one file * pass kwargs for `format_data` through `read_file()` * pep8 and docstring cleanup * add to release notes * minor edits as requested by @gidden in the review --- RELEASE_NOTES.md | 1 + pyam/core.py | 53 +++++++++++----------- pyam/utils.py | 90 ++++++++++++++++++++++++++++++-------- tests/test_cast_to_iamc.py | 53 ++++++++++++++++++++++ tests/test_core.py | 14 ++++-- tests/test_utils.py | 41 +++++++++++++++++ 6 files changed, 203 insertions(+), 49 deletions(-) create mode 100644 tests/test_cast_to_iamc.py 
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 6a82f2482..04c4b1054 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,6 +1,7 @@ # Next Release +- [#199](https://github.com/IAMconsortium/pyam/pull/199) Initializing an `IamDataFrame` accepts kwargs to fill or create from the data any missing required columns - [#195](https://github.com/IAMconsortium/pyam/pull/195) Fix filtering for `time`, `day` and `hour` to use generic `pattern_match()` (if such a column exists) in 'year'-formmatted IamDataFrames - [#192](https://github.com/IAMconsortium/pyam/pull/192) Extend `utils.find_depth()` to optionally return depth (as list of ints) rather than assert level tests - [#190](https://github.com/IAMconsortium/pyam/pull/190) Add `concat()` function diff --git a/pyam/core.py b/pyam/core.py index 1b4747413..1469b165e 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -20,9 +20,10 @@ from pyam.run_control import run_control from pyam.utils import ( write_sheet, - read_files, + read_file, read_pandas, format_data, + sort_data, to_int, find_depth, pattern_match, @@ -38,7 +39,6 @@ REGION_IDX, IAMC_IDX, SORT_IDX, - LONG_IDX, GROUP_IDX ) from pyam.read_ixmp import read_ix @@ -50,30 +50,29 @@ class IamDataFrame(object): It provides a number of diagnostic features (including validation of data, completeness of variables provided) as well as a number of visualization and plotting tools. - """ + Parameters + ---------- + data: ixmp.TimeSeries, ixmp.Scenario, pd.DataFrame or data file + an instance of an TimeSeries or Scenario (requires `ixmp`), + or pd.DataFrame or data file with IAMC-format data columns. + A pd.DataFrame can have the required data as columns or index. 
+ kwargs: + if `value=col`, melt `col` to `value` and use `col` name as `variable`; + else, mapping of columns required for an `IamDataFrame` to: + - one column in `df` + - multiple columns, which will be concatenated by pipe + - a string to be used as value for this column + """ def __init__(self, data, **kwargs): - """Initialize an instance of an IamDataFrame - - Parameters - ---------- - data: ixmp.TimeSeries, ixmp.Scenario, pd.DataFrame or data file - an instance of an TimeSeries or Scenario (requires `ixmp`), - or pd.DataFrame or data file with IAMC-format data columns. - A pd.DataFrame can have the required data as columns or index. - - Special support is provided for data files downloaded directly from - IIASA SSP and RCP databases. If you run into any problems loading - data, please make an issue at: - https://github.com/IAMconsortium/pyam/issues - """ + """Initialize an instance of an IamDataFrame""" # import data from pd.DataFrame or read from source if isinstance(data, pd.DataFrame) or isinstance(data, pd.Series): - _data = format_data(data.copy()) + _data = format_data(data.copy(), **kwargs) elif has_ix and isinstance(data, ixmp.TimeSeries): _data = read_ix(data, **kwargs) else: - _data = read_files(data, **kwargs) + _data = read_file(data, **kwargs) self.data, self.time_col, self.extra_cols = _data # cast time_col to desired format @@ -181,6 +180,7 @@ def append(self, other, ignore_meta_conflict=False, inplace=False, any meta columns present in `self` and `other` are not identical. 
inplace : bool, default False If True, do operation inplace and return None + kwargs are passed through to `IamDataFrame(other, **kwargs)` """ if not isinstance(other, IamDataFrame): other = IamDataFrame(other, **kwargs) @@ -224,15 +224,14 @@ def append(self, other, ignore_meta_conflict=False, inplace=False, ret.meta = ret.meta.append(other.meta.loc[diff, :], **sort_kwarg) # append other.data (verify integrity for no duplicates) - ret.data.set_index(ret._LONG_IDX, inplace=True) - _other = other.data.set_index(other._LONG_IDX) - ret.data = ret.data.append(_other, verify_integrity=True)\ - .reset_index(drop=False) + _data = ret.data.set_index(ret._LONG_IDX).append( + other.data.set_index(other._LONG_IDX), verify_integrity=True) # merge extra columns in `data` and set `LONG_IDX` ret.extra_cols += [i for i in other.extra_cols if i not in ret.extra_cols] ret._LONG_IDX = IAMC_IDX + [ret.time_col] + ret.extra_cols + ret.data = sort_data(_data.reset_index(), ret._LONG_IDX) if not inplace: return ret @@ -1501,13 +1500,13 @@ def compare(left, right, left_label='left', right_label='right', def concat(dfs): """Concatenate a series of `pyam.IamDataFrame`-like objects together""" - if not hasattr(dfs, '__iter__'): - raise TypeError('Input data must be iterable (e.g., list or tuple)') + if isstr(dfs) or not hasattr(dfs, '__iter__'): + msg = 'Argument must be a non-string iterable (e.g., list or tuple)' + raise TypeError(msg) _df = None for df in dfs: - if not isinstance(df, IamDataFrame): - raise TypeError('Input contains non-`pyam.IamDataFrame`') + df = df if isinstance(df, IamDataFrame) else IamDataFrame(df) if _df is None: _df = copy.deepcopy(df) else: diff --git a/pyam/utils.py b/pyam/utils.py index 8a0a82c5c..7351fcdde 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -112,22 +112,62 @@ def read_pandas(fname, *args, **kwargs): return df -def read_files(fnames, *args, **kwargs): - """Read data from a snapshot file saved in the standard IAMC format +def read_file(fname, *args, 
**kwargs): + """Read data from a file saved in the standard IAMC format or a table with year/value columns """ - if not isstr(fnames): + if not isstr(fname): raise ValueError('reading multiple files not supported, ' 'please use `pyam.IamDataFrame.append()`') - logger().info('Reading `{}`'.format(fnames)) - return format_data(read_pandas(fnames, *args, **kwargs)) + logger().info('Reading `{}`'.format(fname)) + format_kwargs = {} + # extract kwargs that are intended for `format_data` + for c in [i for i in IAMC_IDX + ['year', 'time', 'value'] if i in kwargs]: + format_kwargs[c] = kwargs.pop(c) + return format_data(read_pandas(fname, *args, **kwargs), **format_kwargs) -def format_data(df): - """Convert an imported dataframe and check all required columns""" +def format_data(df, **kwargs): + """Convert a `pd.Dataframe` or `pd.Series` to the required format""" if isinstance(df, pd.Series): df = df.to_frame() + # ensure that only either `value` or `variable` custom setting is used + _cols = ['value', 'variable'] + if any([i in kwargs for i in _cols]) and \ + all([i in kwargs or i in df.columns for i in _cols]): + raise ValueError('using both `value` and `variable` is not valid!') + + # if `value` arg is given, melt columns and use column name as `variable` + if 'value' in kwargs: + value = kwargs.pop('value') + idx = set(df.columns) & (set(IAMC_IDX) | set(['year', 'time'])) + _df = df.set_index(list(idx)) + dfs = [] + for v in value if islistable(value) else [value]: + if v not in df.columns: + raise ValueError('column `{}` does not exist!'.format(v)) + vdf = _df[v].to_frame().rename(columns={v: 'value'}) + vdf['variable'] = v + dfs.append(vdf.reset_index()) + df = pd.concat(dfs).reset_index(drop=True) + + # for other columns, do a rename or concat multiple columns to IAMC-style + for col, value in kwargs.items(): + if col in df: + raise ValueError('conflict of kwarg with column in dataframe!') + + if isstr(value) and value in df: + df.rename(columns={value: 
col}, inplace=True) + elif islistable(value) and all([c in df.columns for c in value]): + df[col] = df.apply(lambda x: concat_with_pipe(x, value), axis=1) + df.drop(value, axis=1, inplace=True) + elif isstr(value): + df[col] = value + else: + raise ValueError('invalid argument for casting `{}: {}`' + .format(col, value)) + # all lower case str_cols = [c for c in df.columns if isstr(c)] df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True) @@ -157,9 +198,9 @@ def format_data(df): if 'value' in df.columns: # check if time column is given as `year` (int) or `time` (datetime) cols = df.columns - if 'year' in cols and 'time' not in cols: + if 'year' in cols: time_col = 'year' - elif 'time' in cols and 'year' not in cols: + elif 'time' in cols: time_col = 'time' else: msg = 'invalid time format, must have either `year` or `time`!' @@ -194,18 +235,18 @@ def format_data(df): # cast value columns to numeric, drop NaN's, sort data df['value'] = df['value'].astype('float64') df.dropna(inplace=True) - df.sort_values(META_IDX + ['variable', time_col, 'region'] + extra_cols, - inplace=True) - return df, time_col, extra_cols + # check for duplicates and return sorted data + idx_cols = IAMC_IDX + [time_col] + extra_cols + if any(df[idx_cols].duplicated()): + raise ValueError('duplicate rows in `data`!') + return sort_data(df, idx_cols), time_col, extra_cols -def style_df(df, style='heatmap'): - if style == 'highlight_not_max': - return df.style.apply(lambda s: ['' if v else 'background-color: yellow' for v in s == s.max()]) - if style == 'heatmap': - cm = sns.light_palette("green", as_cmap=True) - return df.style.background_gradient(cmap=cm) + +def sort_data(data, cols): + """Sort `data` rows and order columns""" + return data.sort_values(cols)[cols + ['value']].reset_index(drop=True) def find_depth(data, s='', level=None): @@ -395,3 +436,16 @@ def to_int(x, index=False): return x else: return _x + + +def concat_with_pipe(x, cols=None): + """Concatenate a 
`pd.Series` separated by `|`, drop `None` or `np.nan`""" + cols = cols or x.index + return '|'.join([x[i] for i in cols if x[i] not in [None, np.nan]]) + + +def reduce_hierarchy(x, depth): + """Reduce the hierarchy (depth by `|`) string to the specified level""" + _x = x.split('|') + depth = len(_x) + depth - 1 if depth < 0 else depth + return '|'.join(_x[0:(depth + 1)]) diff --git a/tests/test_cast_to_iamc.py b/tests/test_cast_to_iamc.py new file mode 100644 index 000000000..a53b91528 --- /dev/null +++ b/tests/test_cast_to_iamc.py @@ -0,0 +1,53 @@ +import pytest +import pandas as pd +from pyam import IamDataFrame, compare + + +def test_cast_from_value_col(meta_df): + df_with_value_cols = pd.DataFrame([ + ['model_a', 'scen_a', 'World', 'EJ/y', 2005, 1, 0.5], + ['model_a', 'scen_a', 'World', 'EJ/y', 2010, 6., 3], + ['model_a', 'scen_b', 'World', 'EJ/y', 2005, 2, None], + ['model_a', 'scen_b', 'World', 'EJ/y', 2010, 7, None] + ], + columns=['model', 'scenario', 'region', 'unit', 'year', + 'Primary Energy', 'Primary Energy|Coal'], + ) + df = IamDataFrame(df_with_value_cols, + value=['Primary Energy', 'Primary Energy|Coal']) + + assert compare(meta_df, df).empty + pd.testing.assert_frame_equal(df.data, meta_df.data) + + +def test_cast_with_model_arg_raises(): + df = pd.DataFrame([ + ['model_a', 'scen_a', 'World', 'EJ/y', 2005, 1, 0.5], + ], + columns=['model', 'scenario', 'region', 'unit', 'year', + 'Primary Energy', 'Primary Energy|Coal'], + ) + pytest.raises(ValueError, IamDataFrame, df, model='foo') + + +def test_cast_with_model_arg(meta_df): + df = meta_df.timeseries().reset_index() + df.rename(columns={'model': 'foo'}, inplace=True) + + df = IamDataFrame(df, model='foo') + assert compare(meta_df, df).empty + pd.testing.assert_frame_equal(df.data, meta_df.data) + + +def test_cast_by_column_concat(meta_df): + df = pd.DataFrame([ + ['scen_a', 'World', 'Primary Energy', None, 'EJ/y', 1, 6.], + ['scen_a', 'World', 'Primary Energy', 'Coal', 'EJ/y', 0.5, 3], + ['scen_b', 
'World', 'Primary Energy', None, 'EJ/y', 2, 7], + ], + columns=['scenario', 'region', 'var_1', 'var_2', 'unit', 2005, 2010], + ) + + df = IamDataFrame(df, model='model_a', variable=['var_1', 'var_2']) + assert compare(meta_df, df).empty + pd.testing.assert_frame_equal(df.data, meta_df.data) diff --git a/tests/test_core.py b/tests/test_core.py index eb03c33c5..13c98e8dd 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -8,7 +8,7 @@ from numpy import testing as npt from pyam import IamDataFrame, validate, categorize, \ - require_variable, filter_by_meta, META_IDX, IAMC_IDX + require_variable, filter_by_meta, META_IDX, IAMC_IDX, sort_data from pyam.core import _meta_idx, concat from conftest import TEST_DATA_DIR @@ -39,6 +39,12 @@ def test_init_df_with_float_cols_raises(test_pd_df): pytest.raises(ValueError, IamDataFrame, data=_test_df) +def test_init_df_with_duplicates_raises(test_df): + _df = test_df.timeseries() + _df = _df.append(_df.iloc[0]).reset_index() + pytest.raises(ValueError, IamDataFrame, data=_df) + + def test_init_df_with_float_cols(test_pd_df): _test_df = test_pd_df.rename(columns={2005: 2005., 2010: 2010.}) obs = IamDataFrame(_test_df).timeseries().reset_index() @@ -772,7 +778,7 @@ def test_filter_by_int(meta_df): def _r5_regions_exp(df): df = df.filter(region='World', keep=False) df['region'] = 'R5MAF' - return df.data.reset_index(drop=True) + return sort_data(df.data, df._LONG_IDX) def test_map_regions_r5(reg_df): @@ -841,7 +847,7 @@ def test_48b(): ['model', 'scen1', 'SDN', 'var', 'unit', 2, 7], ], columns=['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010], - )).data.reset_index(drop=True) + )).data df = IamDataFrame(pd.DataFrame([ ['model', 'scen', 'R5MAF', 'var', 'unit', 1, 6], @@ -850,7 +856,7 @@ def test_48b(): 'variable', 'unit', 2005, 2010], )) obs = df.map_regions('iso', region_col='r5_region').data - obs = obs[obs.region.isin(['SSD', 'SDN'])].reset_index(drop=True) + obs = sort_data(obs[obs.region.isin(['SSD', 
'SDN'])], df._LONG_IDX) pd.testing.assert_frame_equal(obs, exp, check_index_type=False) diff --git a/tests/test_utils.py b/tests/test_utils.py index 6484d2adc..ae66d2508 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,6 +4,7 @@ from pyam import utils TEST_VARS = pd.Series(['foo', 'foo|bar', 'foo|bar|baz']) +TEST_CONCAT_SERIES = pd.Series(['foo', 'bar', 'baz'], index=['f', 'b', 'z']) def test_pattern_match_none(): @@ -145,3 +146,43 @@ def test_find_depth_1_minus(): def test_find_depth_1_plus(): obs = utils.find_depth(TEST_VARS, level='1+') assert obs == [False, True, True] + + +def test_concat_with_pipe_all(): + obs = utils.concat_with_pipe(TEST_CONCAT_SERIES) + assert obs == 'foo|bar|baz' + + +def test_concat_with_pipe_exclude_none(): + s = TEST_CONCAT_SERIES.copy() + s['b'] = None + obs = utils.concat_with_pipe(s) + assert obs == 'foo|baz' + + +def test_concat_with_pipe_exclude_nan(): + s = TEST_CONCAT_SERIES.copy() + s['b'] = np.nan + obs = utils.concat_with_pipe(s) + assert obs == 'foo|baz' + + +def test_concat_with_pipe_by_name(): + obs = utils.concat_with_pipe(TEST_CONCAT_SERIES, ['f', 'z']) + assert obs == 'foo|baz' + + +def test_reduce_hierarchy_0(): + assert utils.reduce_hierarchy('foo|bar|baz', 0) == 'foo' + + +def test_reduce_hierarchy_1(): + assert utils.reduce_hierarchy('foo|bar|baz', 1) == 'foo|bar' + + +def test_reduce_hierarchy_neg1(): + assert utils.reduce_hierarchy('foo|bar|baz', -1) == 'foo|bar' + + +def test_reduce_hierarchy_neg2(): + assert utils.reduce_hierarchy('foo|bar|baz', -2) == 'foo'