From fcb81fe0fbd908a0ae64e70b8ff684c8160de617 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 22 Feb 2019 09:24:21 +0100 Subject: [PATCH] Initialize an `IamDataFrame` from `pd.DataFrame` with formatting specs (#199) * initial impl for giving default pyam dataframes * move again into core * small fixes * update concat to try to cast to dataframe * appease stickler * fix `TypeError` in `concat()` * add `concat_with_pipe()` in `utils` with some tests * cast elements in `concat_with_pipe()` to `str` * drop `nan` instead of casting to `str` * insert function `reduce_hierarchy()` and add tests * raise error if initializing `IamDataFrame` with duplicate rows in `data` * remove unused legacy function * add auxiliary function `sort_data()` for consistent ordering of cols * use `sort_data()` in `append()` * add `sort_data()` to some unit tests to make them pass * refactor and add first test for `df_to_pyam()` * clean-up before stickler gives me an earful... * clean-up of implementation of `df_to_pyam()` * update docstrings * allow input-df with columns `year` and `time` * defining `defaults` doesn't seem to be necessary * require `value` arg in `df_to_pyam()` instead of using all columns * check for conflicts with existing columns, add test * add option in `df_to_pyam()` to rename cols * add feature to concat required column from multiple given columns * make `cast_to_iam` part of `format_data()` * rename function to `read_file()` because it can only read one file * pass kwargs for `format_data` through `read_file()` * pep8 and docstring cleanup * add to release notes * minor edits as requested by @gidden in the review --- RELEASE_NOTES.md | 1 + pyam/core.py | 53 +++++++++++----------- pyam/utils.py | 90 ++++++++++++++++++++++++++++++-------- tests/test_cast_to_iamc.py | 53 ++++++++++++++++++++++ tests/test_core.py | 14 ++++-- tests/test_utils.py | 41 +++++++++++++++++ 6 files changed, 203 insertions(+), 49 deletions(-) create mode 100644 tests/test_cast_to_iamc.py 
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 6a82f2482..04c4b1054 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,6 +1,7 @@ # Next Release +- [#199](https://github.com/IAMconsortium/pyam/pull/199) Initializing an `IamDataFrame` accepts kwargs to fill or create from the data any missing required columns - [#195](https://github.com/IAMconsortium/pyam/pull/195) Fix filtering for `time`, `day` and `hour` to use generic `pattern_match()` (if such a column exists) in 'year'-formmatted IamDataFrames - [#192](https://github.com/IAMconsortium/pyam/pull/192) Extend `utils.find_depth()` to optionally return depth (as list of ints) rather than assert level tests - [#190](https://github.com/IAMconsortium/pyam/pull/190) Add `concat()` function diff --git a/pyam/core.py b/pyam/core.py index 1b4747413..1469b165e 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -20,9 +20,10 @@ from pyam.run_control import run_control from pyam.utils import ( write_sheet, - read_files, + read_file, read_pandas, format_data, + sort_data, to_int, find_depth, pattern_match, @@ -38,7 +39,6 @@ REGION_IDX, IAMC_IDX, SORT_IDX, - LONG_IDX, GROUP_IDX ) from pyam.read_ixmp import read_ix @@ -50,30 +50,29 @@ class IamDataFrame(object): It provides a number of diagnostic features (including validation of data, completeness of variables provided) as well as a number of visualization and plotting tools. - """ + Parameters + ---------- + data: ixmp.TimeSeries, ixmp.Scenario, pd.DataFrame or data file + an instance of an TimeSeries or Scenario (requires `ixmp`), + or pd.DataFrame or data file with IAMC-format data columns. + A pd.DataFrame can have the required data as columns or index. 
+ kwargs: + if `value=col`, melt `col` to `value` and use `col` name as `variable`; + else, mapping of columns required for an `IamDataFrame` to: + - one column in `df` + - multiple columns, which will be concatenated by pipe + - a string to be used as value for this column + """ def __init__(self, data, **kwargs): - """Initialize an instance of an IamDataFrame - - Parameters - ---------- - data: ixmp.TimeSeries, ixmp.Scenario, pd.DataFrame or data file - an instance of an TimeSeries or Scenario (requires `ixmp`), - or pd.DataFrame or data file with IAMC-format data columns. - A pd.DataFrame can have the required data as columns or index. - - Special support is provided for data files downloaded directly from - IIASA SSP and RCP databases. If you run into any problems loading - data, please make an issue at: - https://github.com/IAMconsortium/pyam/issues - """ + """Initialize an instance of an IamDataFrame""" # import data from pd.DataFrame or read from source if isinstance(data, pd.DataFrame) or isinstance(data, pd.Series): - _data = format_data(data.copy()) + _data = format_data(data.copy(), **kwargs) elif has_ix and isinstance(data, ixmp.TimeSeries): _data = read_ix(data, **kwargs) else: - _data = read_files(data, **kwargs) + _data = read_file(data, **kwargs) self.data, self.time_col, self.extra_cols = _data # cast time_col to desired format @@ -181,6 +180,7 @@ def append(self, other, ignore_meta_conflict=False, inplace=False, any meta columns present in `self` and `other` are not identical. 
inplace : bool, default False If True, do operation inplace and return None + kwargs are passed through to `IamDataFrame(other, **kwargs)` """ if not isinstance(other, IamDataFrame): other = IamDataFrame(other, **kwargs) @@ -224,15 +224,14 @@ def append(self, other, ignore_meta_conflict=False, inplace=False, ret.meta = ret.meta.append(other.meta.loc[diff, :], **sort_kwarg) # append other.data (verify integrity for no duplicates) - ret.data.set_index(ret._LONG_IDX, inplace=True) - _other = other.data.set_index(other._LONG_IDX) - ret.data = ret.data.append(_other, verify_integrity=True)\ - .reset_index(drop=False) + _data = ret.data.set_index(ret._LONG_IDX).append( + other.data.set_index(other._LONG_IDX), verify_integrity=True) # merge extra columns in `data` and set `LONG_IDX` ret.extra_cols += [i for i in other.extra_cols if i not in ret.extra_cols] ret._LONG_IDX = IAMC_IDX + [ret.time_col] + ret.extra_cols + ret.data = sort_data(_data.reset_index(), ret._LONG_IDX) if not inplace: return ret @@ -1501,13 +1500,13 @@ def compare(left, right, left_label='left', right_label='right', def concat(dfs): """Concatenate a series of `pyam.IamDataFrame`-like objects together""" - if not hasattr(dfs, '__iter__'): - raise TypeError('Input data must be iterable (e.g., list or tuple)') + if isstr(dfs) or not hasattr(dfs, '__iter__'): + msg = 'Argument must be a non-string iterable (e.g., list or tuple)' + raise TypeError(msg) _df = None for df in dfs: - if not isinstance(df, IamDataFrame): - raise TypeError('Input contains non-`pyam.IamDataFrame`') + df = df if isinstance(df, IamDataFrame) else IamDataFrame(df) if _df is None: _df = copy.deepcopy(df) else: diff --git a/pyam/utils.py b/pyam/utils.py index 8a0a82c5c..7351fcdde 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -112,22 +112,62 @@ def read_pandas(fname, *args, **kwargs): return df -def read_files(fnames, *args, **kwargs): - """Read data from a snapshot file saved in the standard IAMC format +def read_file(fname, *args, 
**kwargs): + """Read data from a file saved in the standard IAMC format or a table with year/value columns """ - if not isstr(fnames): + if not isstr(fname): raise ValueError('reading multiple files not supported, ' 'please use `pyam.IamDataFrame.append()`') - logger().info('Reading `{}`'.format(fnames)) - return format_data(read_pandas(fnames, *args, **kwargs)) + logger().info('Reading `{}`'.format(fname)) + format_kwargs = {} + # extract kwargs that are intended for `format_data` + for c in [i for i in IAMC_IDX + ['year', 'time', 'value'] if i in kwargs]: + format_kwargs[c] = kwargs.pop(c) + return format_data(read_pandas(fname, *args, **kwargs), **format_kwargs) -def format_data(df): - """Convert an imported dataframe and check all required columns""" +def format_data(df, **kwargs): + """Convert a `pd.Dataframe` or `pd.Series` to the required format""" if isinstance(df, pd.Series): df = df.to_frame() + # ensure that only either `value` or `variable` custom setting is used + _cols = ['value', 'variable'] + if any([i in kwargs for i in _cols]) and \ + all([i in kwargs or i in df.columns for i in _cols]): + raise ValueError('using both `value` and `variable` is not valid!') + + # if `value` arg is given, melt columns and use column name as `variable` + if 'value' in kwargs: + value = kwargs.pop('value') + idx = set(df.columns) & (set(IAMC_IDX) | set(['year', 'time'])) + _df = df.set_index(list(idx)) + dfs = [] + for v in value if islistable(value) else [value]: + if v not in df.columns: + raise ValueError('column `{}` does not exist!'.format(v)) + vdf = _df[v].to_frame().rename(columns={v: 'value'}) + vdf['variable'] = v + dfs.append(vdf.reset_index()) + df = pd.concat(dfs).reset_index(drop=True) + + # for other columns, do a rename or concat multiple columns to IAMC-style + for col, value in kwargs.items(): + if col in df: + raise ValueError('conflict of kwarg with column in dataframe!') + + if isstr(value) and value in df: + df.rename(columns={value: 
col}, inplace=True) + elif islistable(value) and all([c in df.columns for c in value]): + df[col] = df.apply(lambda x: concat_with_pipe(x, value), axis=1) + df.drop(value, axis=1, inplace=True) + elif isstr(value): + df[col] = value + else: + raise ValueError('invalid argument for casting `{}: {}`' + .format(col, value)) + # all lower case str_cols = [c for c in df.columns if isstr(c)] df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True) @@ -157,9 +198,9 @@ def format_data(df): if 'value' in df.columns: # check if time column is given as `year` (int) or `time` (datetime) cols = df.columns - if 'year' in cols and 'time' not in cols: + if 'year' in cols: time_col = 'year' - elif 'time' in cols and 'year' not in cols: + elif 'time' in cols: time_col = 'time' else: msg = 'invalid time format, must have either `year` or `time`!' @@ -194,18 +235,18 @@ def format_data(df): # cast value columns to numeric, drop NaN's, sort data df['value'] = df['value'].astype('float64') df.dropna(inplace=True) - df.sort_values(META_IDX + ['variable', time_col, 'region'] + extra_cols, - inplace=True) - return df, time_col, extra_cols + # check for duplicates and return sorted data + idx_cols = IAMC_IDX + [time_col] + extra_cols + if any(df[idx_cols].duplicated()): + raise ValueError('duplicate rows in `data`!') + return sort_data(df, idx_cols), time_col, extra_cols -def style_df(df, style='heatmap'): - if style == 'highlight_not_max': - return df.style.apply(lambda s: ['' if v else 'background-color: yellow' for v in s == s.max()]) - if style == 'heatmap': - cm = sns.light_palette("green", as_cmap=True) - return df.style.background_gradient(cmap=cm) + +def sort_data(data, cols): + """Sort `data` rows and order columns""" + return data.sort_values(cols)[cols + ['value']].reset_index(drop=True) def find_depth(data, s='', level=None): @@ -395,3 +436,16 @@ def to_int(x, index=False): return x else: return _x + + +def concat_with_pipe(x, cols=None): + """Concatenate a 
`pd.Series` separated by `|`, drop `None` or `np.nan`""" + cols = cols or x.index + return '|'.join([x[i] for i in cols if x[i] not in [None, np.nan]]) + + +def reduce_hierarchy(x, depth): + """Reduce the hierarchy (depth by `|`) string to the specified level""" + _x = x.split('|') + depth = len(_x) + depth - 1 if depth < 0 else depth + return '|'.join(_x[0:(depth + 1)]) diff --git a/tests/test_cast_to_iamc.py b/tests/test_cast_to_iamc.py new file mode 100644 index 000000000..a53b91528 --- /dev/null +++ b/tests/test_cast_to_iamc.py @@ -0,0 +1,53 @@ +import pytest +import pandas as pd +from pyam import IamDataFrame, compare + + +def test_cast_from_value_col(meta_df): + df_with_value_cols = pd.DataFrame([ + ['model_a', 'scen_a', 'World', 'EJ/y', 2005, 1, 0.5], + ['model_a', 'scen_a', 'World', 'EJ/y', 2010, 6., 3], + ['model_a', 'scen_b', 'World', 'EJ/y', 2005, 2, None], + ['model_a', 'scen_b', 'World', 'EJ/y', 2010, 7, None] + ], + columns=['model', 'scenario', 'region', 'unit', 'year', + 'Primary Energy', 'Primary Energy|Coal'], + ) + df = IamDataFrame(df_with_value_cols, + value=['Primary Energy', 'Primary Energy|Coal']) + + assert compare(meta_df, df).empty + pd.testing.assert_frame_equal(df.data, meta_df.data) + + +def test_cast_with_model_arg_raises(): + df = pd.DataFrame([ + ['model_a', 'scen_a', 'World', 'EJ/y', 2005, 1, 0.5], + ], + columns=['model', 'scenario', 'region', 'unit', 'year', + 'Primary Energy', 'Primary Energy|Coal'], + ) + pytest.raises(ValueError, IamDataFrame, df, model='foo') + + +def test_cast_with_model_arg(meta_df): + df = meta_df.timeseries().reset_index() + df.rename(columns={'model': 'foo'}, inplace=True) + + df = IamDataFrame(df, model='foo') + assert compare(meta_df, df).empty + pd.testing.assert_frame_equal(df.data, meta_df.data) + + +def test_cast_by_column_concat(meta_df): + df = pd.DataFrame([ + ['scen_a', 'World', 'Primary Energy', None, 'EJ/y', 1, 6.], + ['scen_a', 'World', 'Primary Energy', 'Coal', 'EJ/y', 0.5, 3], + ['scen_b', 
'World', 'Primary Energy', None, 'EJ/y', 2, 7], + ], + columns=['scenario', 'region', 'var_1', 'var_2', 'unit', 2005, 2010], + ) + + df = IamDataFrame(df, model='model_a', variable=['var_1', 'var_2']) + assert compare(meta_df, df).empty + pd.testing.assert_frame_equal(df.data, meta_df.data) diff --git a/tests/test_core.py b/tests/test_core.py index eb03c33c5..13c98e8dd 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -8,7 +8,7 @@ from numpy import testing as npt from pyam import IamDataFrame, validate, categorize, \ - require_variable, filter_by_meta, META_IDX, IAMC_IDX + require_variable, filter_by_meta, META_IDX, IAMC_IDX, sort_data from pyam.core import _meta_idx, concat from conftest import TEST_DATA_DIR @@ -39,6 +39,12 @@ def test_init_df_with_float_cols_raises(test_pd_df): pytest.raises(ValueError, IamDataFrame, data=_test_df) +def test_init_df_with_duplicates_raises(test_df): + _df = test_df.timeseries() + _df = _df.append(_df.iloc[0]).reset_index() + pytest.raises(ValueError, IamDataFrame, data=_df) + + def test_init_df_with_float_cols(test_pd_df): _test_df = test_pd_df.rename(columns={2005: 2005., 2010: 2010.}) obs = IamDataFrame(_test_df).timeseries().reset_index() @@ -772,7 +778,7 @@ def test_filter_by_int(meta_df): def _r5_regions_exp(df): df = df.filter(region='World', keep=False) df['region'] = 'R5MAF' - return df.data.reset_index(drop=True) + return sort_data(df.data, df._LONG_IDX) def test_map_regions_r5(reg_df): @@ -841,7 +847,7 @@ def test_48b(): ['model', 'scen1', 'SDN', 'var', 'unit', 2, 7], ], columns=['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010], - )).data.reset_index(drop=True) + )).data df = IamDataFrame(pd.DataFrame([ ['model', 'scen', 'R5MAF', 'var', 'unit', 1, 6], @@ -850,7 +856,7 @@ def test_48b(): 'variable', 'unit', 2005, 2010], )) obs = df.map_regions('iso', region_col='r5_region').data - obs = obs[obs.region.isin(['SSD', 'SDN'])].reset_index(drop=True) + obs = sort_data(obs[obs.region.isin(['SSD', 
'SDN'])], df._LONG_IDX) pd.testing.assert_frame_equal(obs, exp, check_index_type=False) diff --git a/tests/test_utils.py b/tests/test_utils.py index 6484d2adc..ae66d2508 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,6 +4,7 @@ from pyam import utils TEST_VARS = pd.Series(['foo', 'foo|bar', 'foo|bar|baz']) +TEST_CONCAT_SERIES = pd.Series(['foo', 'bar', 'baz'], index=['f', 'b', 'z']) def test_pattern_match_none(): @@ -145,3 +146,43 @@ def test_find_depth_1_minus(): def test_find_depth_1_plus(): obs = utils.find_depth(TEST_VARS, level='1+') assert obs == [False, True, True] + + +def test_concat_with_pipe_all(): + obs = utils.concat_with_pipe(TEST_CONCAT_SERIES) + assert obs == 'foo|bar|baz' + + +def test_concat_with_pipe_exclude_none(): + s = TEST_CONCAT_SERIES.copy() + s['b'] = None + obs = utils.concat_with_pipe(s) + assert obs == 'foo|baz' + + +def test_concat_with_pipe_exclude_nan(): + s = TEST_CONCAT_SERIES.copy() + s['b'] = np.nan + obs = utils.concat_with_pipe(s) + assert obs == 'foo|baz' + + +def test_concat_with_pipe_by_name(): + obs = utils.concat_with_pipe(TEST_CONCAT_SERIES, ['f', 'z']) + assert obs == 'foo|baz' + + +def test_reduce_hierarchy_0(): + assert utils.reduce_hierarchy('foo|bar|baz', 0) == 'foo' + + +def test_reduce_hierarchy_1(): + assert utils.reduce_hierarchy('foo|bar|baz', 1) == 'foo|bar' + + +def test_reduce_hierarchy_neg1(): + assert utils.reduce_hierarchy('foo|bar|baz', -1) == 'foo|bar' + + +def test_reduce_hierarchy_neg2(): + assert utils.reduce_hierarchy('foo|bar|baz', -2) == 'foo'