diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index fc0182d83..f89505224 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,7 @@ +# Next release + +- [#416](https://github.com/IAMconsortium/pyam/pull/416) Include `meta` in new IamDataFrames returned by aggregation functions + # Release v0.7.0 ## Highlights diff --git a/pyam/core.py b/pyam/core.py index 53f3be5d4..d757b7a6d 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -33,9 +33,9 @@ read_pandas, format_data, sort_data, + merge_meta, to_int, find_depth, - reduce_hierarchy, pattern_match, years_match, month_match, @@ -76,14 +76,18 @@ class IamDataFrame(object): Support is provided additionally for R-style data columns for years, like "X2015", etc. kwargs - if `value=col`, melt column `col` to 'value' and use `col` name as - 'variable'; or mapping of required columns (:code:`IAMC_IDX`) to + If `value=<col>`, melt column `<col>` to 'value' and use `<col>` name + as 'variable'; or mapping of required columns (:code:`IAMC_IDX`) to any of the following: - one column in `data` - multiple columns, to be concatenated by :code:`|` - a string to be used as value for this column + A :class:`pandas.DataFrame` with suitable `meta` indicators can be + passed as `meta=<df>`. The index will be downselected to those + scenarios that have timeseries data. 
+ Notes ----- When initializing an :class:`IamDataFrame` from an xlsx file, @@ -115,10 +119,14 @@ def _init(self, data, **kwargs): """Process data and set attributes for new instance""" # import data from pd.DataFrame or read from source if isinstance(data, pd.DataFrame) or isinstance(data, pd.Series): + meta = kwargs.pop('meta') if 'meta' in kwargs else None _data = format_data(data.copy(), **kwargs) elif has_ix and isinstance(data, ixmp.TimeSeries): + # TODO read meta indicators from ixmp + meta = None _data = read_ix(data, **kwargs) else: + meta = None logger.info('Reading file `{}`'.format(data)) _data = read_file(data, **kwargs) @@ -135,6 +143,11 @@ def _init(self, data, **kwargs): self.meta = self.data[META_IDX].drop_duplicates().set_index(META_IDX) self.reset_exclude() + # merge meta dataframe (if given in kwargs) + if meta is not None: + self.meta = merge_meta(meta.loc[_make_index(self.data)], + self.meta, ignore_meta_conflict=True) + # if initializing from xlsx, try to load `meta` table from file meta_sheet = kwargs.get('meta_sheet_name', 'meta') if isstr(data) and data.endswith('.xlsx') and meta_sheet is not False\ @@ -254,8 +267,9 @@ def append(self, other, ignore_meta_conflict=False, inplace=False, **kwargs): """Append any IamDataFrame-like object to this object - Columns in `other.meta` that are not in `self.meta` are always merged, - duplicate region-variable-unit-year rows raise a ValueError. + Indicators in `other.meta` that are not in `self.meta` are merged. + Missing values are set to `NaN`. + Conflicting `data` rows always raise a `ValueError`. Parameters ---------- @@ -266,8 +280,9 @@ def append(self, other, ignore_meta_conflict=False, inplace=False, any meta columns present in `self` and `other` are not identical. 
inplace : bool, default False if True, do operation inplace and return None - kwargs : initializing other as IamDataFrame + kwargs passed to :class:`IamDataFrame(other, **kwargs) <IamDataFrame>` + if `other` is not already an IamDataFrame """ if not isinstance(other, IamDataFrame): other = IamDataFrame(other, **kwargs) @@ -278,41 +293,15 @@ ret = self.copy() if not inplace else self - diff = other.meta.index.difference(ret.meta.index) - intersect = other.meta.index.intersection(ret.meta.index) - - # merge other.meta columns not in self.meta for existing scenarios - if not intersect.empty: - # if not ignored, check that overlapping meta dataframes are equal - if not ignore_meta_conflict: - cols = [i for i in other.meta.columns if i in ret.meta.columns] - if not ret.meta.loc[intersect, cols].equals( - other.meta.loc[intersect, cols]): - conflict_idx = ( - pd.concat([ret.meta.loc[intersect, cols], - other.meta.loc[intersect, cols]] - ).drop_duplicates() - .index.drop_duplicates() - ) - msg = 'conflict in `meta` for scenarios {}'.format( - [i for i in pd.DataFrame(index=conflict_idx).index]) - raise ValueError(msg) - - cols = [i for i in other.meta.columns if i not in ret.meta.columns] - _meta = other.meta.loc[intersect, cols] - ret.meta = ret.meta.merge(_meta, how='outer', - left_index=True, right_index=True) - - # join other.meta for new scenarios - if not diff.empty: - ret.meta = ret.meta.append(other.meta.loc[diff, :], sort=False) + # merge `meta` tables + ret.meta = merge_meta(ret.meta, other.meta, ignore_meta_conflict) # append other.data (verify integrity for no duplicates) _data = ret.data.set_index(sorted(ret._LONG_IDX)).append( other.data.set_index(sorted(other._LONG_IDX)), verify_integrity=True) - # merge extra columns in `data` and set `LONG_IDX` + # merge extra columns in `data` and set `self._LONG_IDX` ret.extra_cols += [i for i in other.extra_cols if i not in ret.extra_cols] ret._LONG_IDX = IAMC_IDX + 
[ret.time_col] + ret.extra_cols @@ -928,7 +917,7 @@ def aggregate(self, variable, components=None, method='sum', if append is True: self.append(_df, inplace=True) else: - return IamDataFrame(_df) + return IamDataFrame(_df, meta=self.meta) def check_aggregate(self, variable, components=None, method='sum', exclude_on_fail=False, multiplier=1, **kwargs): @@ -1019,7 +1008,7 @@ def aggregate_region(self, variable, region='World', subregions=None, if append is True: self.append(_df, region=region, inplace=True) else: - return IamDataFrame(_df, region=region) + return IamDataFrame(_df, region=region, meta=self.meta) def check_aggregate_region(self, variable, region='World', subregions=None, components=False, method='sum', weight=None, @@ -1095,17 +1084,17 @@ def aggregate_time(self, variable, column='subannual', value='year', ---------- variable : str or list of str variable(s) to be aggregated - column : str, default 'subannual' + column : str, optional the data column to be used as subannual time representation - value : str, default 'year + value : str, optional the name of the aggregated (subannual) time components : list of str subannual timeslices to be aggregated; defaults to all subannual - timeslices other than ``value`` - method : func or str, default 'sum' + timeslices other than `value` + method : func or str, optional method to use for aggregation, e.g. 
:func:`numpy.mean`, :func:`numpy.sum`, 'min', 'max' - append : bool, default False + append : bool, optional append the aggregate timeseries to `self` and return None, else return aggregate timeseries as new :class:`IamDataFrame` """ @@ -1120,9 +1109,7 @@ def aggregate_time(self, variable, column='subannual', value='year', if append is True: self.append(_df, inplace=True) else: - df = IamDataFrame(_df) - df.meta = self.meta.loc[_make_index(df.data)] - return df + return IamDataFrame(_df, meta=self.meta) def downscale_region(self, variable, region='World', subregions=None, proxy=None, weight=None, append=False): @@ -1180,9 +1167,7 @@ def downscale_region(self, variable, region='World', subregions=None, if append is True: self.append(_data, inplace=True) else: - df = IamDataFrame(_data) - df.meta = self.meta.loc[_make_index(df.data)] - return df + return IamDataFrame(_data, meta=self.meta) def _all_other_regions(self, region, variable=None): """Return list of regions other than `region` containing `variable`""" @@ -1796,9 +1781,15 @@ def _apply_criteria(df, criteria, **kwargs): def _make_index(df, cols=META_IDX): - """Create an index from the columns of a dataframe""" - return pd.MultiIndex.from_tuples( - pd.unique(list(zip(*[df[col] for col in cols]))), names=tuple(cols)) + """Create an index from the columns of a dataframe or series""" + def _get_col(c): + try: + return df.index.get_level_values(c) + except KeyError: + return df[c] + + index = pd.unique(list(zip(*[_get_col(col) for col in cols]))) + return pd.MultiIndex.from_tuples(index, names=tuple(cols)) def validate(df, criteria={}, exclude_on_fail=False, **kwargs): diff --git a/pyam/utils.py b/pyam/utils.py index 230d3a460..24a3674ce 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -280,6 +280,39 @@ def sort_data(data, cols): return data.sort_values(cols)[cols + ['value']].reset_index(drop=True) +def merge_meta(left, right, ignore_meta_conflict=False): + """Merge two `meta` tables; raise if values are in 
conflict (optional) + + If conflicts are ignored, values in `left` take precedence over `right`. + """ + left = left.copy() # make a copy to not change the original object + diff = right.index.difference(left.index) + sect = right.index.intersection(left.index) + + # merge `right` into `left` for overlapping scenarios ( `sect`) + if not sect.empty: + # if not ignored, check that overlapping `meta` columns are equal + if not ignore_meta_conflict: + cols = [i for i in right.columns if i in left.columns] + if not left.loc[sect, cols].equals(right.loc[sect, cols]): + conflict_idx = ( + pd.concat([right.loc[sect, cols], left.loc[sect, cols]]) + .drop_duplicates().index.drop_duplicates() + ) + msg = 'conflict in `meta` for scenarios {}'.format( + [i for i in pd.DataFrame(index=conflict_idx).index]) + raise ValueError(msg) + # merge new columns + cols = [i for i in right.columns if i not in left.columns] + left = left.merge(right.loc[sect, cols], how='outer', + left_index=True, right_index=True) + + # join `other.meta` for new scenarios (`diff`) + if not diff.empty: + left = left.append(right.loc[diff, :], sort=False) + + return left + def find_depth(data, s='', level=None): """Return or assert the depth (number of ``|``) of variables diff --git a/tests/conftest.py b/tests/conftest.py index db833a983..6a8fde572 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,19 +3,20 @@ matplotlib.use('agg') import os -from requests.exceptions import SSLError +from requests.exceptions import ConnectionError import pytest +import numpy as np import pandas as pd from datetime import datetime -from pyam import IamDataFrame, IAMC_IDX, iiasa +from pyam import IamDataFrame, META_IDX, IAMC_IDX, iiasa # verify whether IIASA database API can be reached, skip tests otherwise try: iiasa.Connection() IIASA_UNAVAILABLE = False -except SSLError: +except ConnectionError: IIASA_UNAVAILABLE = True TEST_API = 'integration-test' @@ -43,6 +44,12 @@ columns=IAMC_IDX + TEST_YEARS, ) +META_COLS = 
['number', 'string'] +META_DF = pd.DataFrame([ + ['model_a', 'scen_a', 1, 'foo'], + ['model_a', 'scen_b', 2, np.nan], +], columns=META_IDX + META_COLS).set_index(META_IDX) + FULL_FEATURE_DF = pd.DataFrame([ ['World', 'Primary Energy', 'EJ/yr', 12, 15], @@ -120,6 +127,8 @@ def test_df(request): tdf = TEST_DF.rename({2005: request.param[0], 2010: request.param[1]}, axis="columns") df = IamDataFrame(data=tdf) + for i in META_COLS: + df.set_meta(META_DF[i]) yield df @@ -127,6 +136,8 @@ def test_df(request): @pytest.fixture(scope="function") def test_df_year(): df = IamDataFrame(data=TEST_DF) + for i in META_COLS: + df.set_meta(META_DF[i]) yield df @@ -148,7 +159,9 @@ def simple_df(request): _df = FULL_FEATURE_DF.copy() if request.param == 'datetime': _df.rename(DTS_MAPPING, axis="columns", inplace=True) - yield IamDataFrame(model='model_a', scenario='scen_a', data=_df) + df = IamDataFrame(model='model_a', scenario='scen_a', data=_df) + df.set_meta('foo', 'string') + yield df # IamDataFrame with subannual time resolution @@ -165,8 +178,9 @@ def add_subannual(_data, name, value): mapping = [('year', 1), ('winter', 0.7), ('summer', 0.3)] lst = [add_subannual(_df.copy(), name, value) for name, value in mapping] - yield IamDataFrame(model='model_a', scenario='scen_a', data=pd.concat(lst)) - + df = IamDataFrame(model='model_a', scenario='scen_a', data=pd.concat(lst)) + df.set_meta('foo', 'string') + yield df @pytest.fixture(scope="function") def reg_df(): diff --git a/tests/test_feature_aggregate.py b/tests/test_feature_aggregate.py index fd9a2d93b..1d667190e 100644 --- a/tests/test_feature_aggregate.py +++ b/tests/test_feature_aggregate.py @@ -65,7 +65,7 @@ def test_aggregate(simple_df, variable, data): if simple_df.time_col == 'time': _df.year = _df.year.replace(DTS_MAPPING) _df.rename({'year': 'time'}, axis='columns', inplace=True) - exp = IamDataFrame(_df) + exp = IamDataFrame(_df, meta=simple_df.meta) for m in ['max', np.max]: 
assert_iamframe_equal(simple_df.aggregate(variable, method=m), exp) @@ -262,10 +262,11 @@ def test_aggregate_region_with_other_method(simple_df, variable, data): if simple_df.time_col == 'time': _df.year = _df.year.replace(DTS_MAPPING) _df.rename({'year': 'time'}, axis='columns', inplace=True) - exp = IamDataFrame(_df).filter(region='World') + + exp = IamDataFrame(_df, meta=simple_df.meta).filter(region='World') for m in ['max', np.max]: - assert_iamframe_equal(simple_df.aggregate_region(variable, method=m), - exp) + obs = simple_df.aggregate_region(variable, method=m) + assert_iamframe_equal(obs, exp) def test_aggregate_region_with_components(simple_df): diff --git a/tests/test_feature_append_rename.py b/tests/test_feature_append_rename.py index 16e4cfd39..a50b7f1d3 100644 --- a/tests/test_feature_append_rename.py +++ b/tests/test_feature_append_rename.py @@ -7,7 +7,7 @@ from pyam import IamDataFrame, META_IDX, IAMC_IDX, compare -from conftest import TEST_DTS +from conftest import TEST_DTS, META_COLS RENAME_DF = IamDataFrame(pd.DataFrame([ @@ -85,7 +85,8 @@ def test_append_same_scenario(test_df): df = test_df.append(other, ignore_meta_conflict=True) # check that the new meta.index is updated, but not the original one - npt.assert_array_equal(test_df.meta.columns, ['exclude', 'col1']) + cols = ['exclude'] + META_COLS + ['col1'] + npt.assert_array_equal(test_df.meta.columns, cols) # assert that merging of meta works as expected exp = test_df.meta.copy() @@ -191,9 +192,9 @@ def test_rename_index(test_df): # test meta changes exp = pd.DataFrame([ - ['model_b', 'scen_c', False], - ['model_a', 'scen_b', False], - ], columns=['model', 'scenario', 'exclude'] + ['model_b', 'scen_c', False, 1, 'foo'], + ['model_a', 'scen_b', False, 2, np.nan], + ], columns=['model', 'scenario', 'exclude'] + META_COLS ).set_index(META_IDX) pd.testing.assert_frame_equal(obs.meta, exp) @@ -222,10 +223,10 @@ def test_rename_append(test_df): # test meta changes exp = pd.DataFrame([ - ['model_a', 
'scen_a', False], - ['model_a', 'scen_b', False], - ['model_b', 'scen_c', False], - ], columns=['model', 'scenario', 'exclude'] + ['model_a', 'scen_a', False, 1, 'foo'], + ['model_a', 'scen_b', False, 2, np.nan], + ['model_b', 'scen_c', False, 1, 'foo'], + ], columns=['model', 'scenario', 'exclude'] + META_COLS ).set_index(META_IDX) pd.testing.assert_frame_equal(obs.meta, exp) diff --git a/tests/test_iiasa.py b/tests/test_iiasa.py index 6ae3ef43c..ef4eec24c 100644 --- a/tests/test_iiasa.py +++ b/tests/test_iiasa.py @@ -9,7 +9,7 @@ from pyam import IamDataFrame, iiasa, read_iiasa, META_IDX from pyam.testing import assert_iamframe_equal -from conftest import IIASA_UNAVAILABLE, TEST_API, TEST_API_NAME +from conftest import IIASA_UNAVAILABLE, META_COLS, TEST_API, TEST_API_NAME if IIASA_UNAVAILABLE: pytest.skip('IIASA database API unavailable', allow_module_level=True) @@ -23,7 +23,6 @@ ) VERSION_COLS = ['version', 'is_default'] -META_COLS = ['number', 'string'] META_DF = pd.DataFrame([ ['model_a', 'scen_a', 1, True, 1, 'foo'], ['model_a', 'scen_b', 1, True, 2, np.nan], diff --git a/tests/test_io.py b/tests/test_io.py index e8380fd6e..4c51da765 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -60,16 +60,16 @@ def test_load_meta(test_df, args): pd.testing.assert_series_equal(obs['category'], exp['category']) -def test_load_ssp_database_downloaded_file(test_df_year): - exp = test_df_year.filter(**FILTER_ARGS).as_pandas() +def test_load_ssp_database_downloaded_file(test_pd_df): + exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas() obs_df = IamDataFrame(os.path.join( TEST_DATA_DIR, 'test_SSP_database_raw_download.xlsx') ) pd.testing.assert_frame_equal(obs_df.as_pandas(), exp) -def test_load_rcp_database_downloaded_file(test_df_year): - exp = test_df_year.filter(**FILTER_ARGS).as_pandas() +def test_load_rcp_database_downloaded_file(test_pd_df): + exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas() obs_df = IamDataFrame(os.path.join( 
TEST_DATA_DIR, 'test_RCP_database_raw_download.xlsx') ) diff --git a/tests/test_utils.py b/tests/test_utils.py index ae66d2508..6c273a569 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,9 @@ +import pytest import pandas as pd import numpy as np +from pandas import testing as pdt -from pyam import utils +from pyam import utils, META_IDX TEST_VARS = pd.Series(['foo', 'foo|bar', 'foo|bar|baz']) TEST_CONCAT_SERIES = pd.Series(['foo', 'bar', 'baz'], index=['f', 'b', 'z']) @@ -186,3 +188,28 @@ def test_reduce_hierarchy_neg1(): def test_reduce_hierarchy_neg2(): assert utils.reduce_hierarchy('foo|bar|baz', -2) == 'foo' + + +def test_merge_meta(): + # test merging of two meta tables + left = pd.DataFrame([ + ['model_a', 'scen_a', 'foo', 1], + ['model_a', 'scen_b', 'bar', 2], + ], columns=META_IDX + ['string', 'value']).set_index(META_IDX) + right = pd.DataFrame([ + ['model_a', 'scen_a', 'bar', 2], + ['model_b', 'scen_a', 'baz', 3], + ], columns=META_IDX + ['string', 'value2']).set_index(META_IDX) + + # merge conflict raises an error + pytest.raises(ValueError, utils.merge_meta, left, right) + + # merge conflict ignoring errors yields expected results + exp = pd.DataFrame([ + ['model_a', 'scen_a', 'foo', 1, 2], + ['model_a', 'scen_b', 'bar', 2, np.nan], + ['model_b', 'scen_a', 'baz', np.nan, 3], + ], columns=META_IDX + ['string', 'value', 'value2']).set_index(META_IDX) + + obs = utils.merge_meta(left, right, ignore_meta_conflict=True) + pdt.assert_frame_equal(exp, obs)