diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index fc0182d83..f89505224 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -1,3 +1,7 @@
+# Next release
+
+- [#416](https://github.com/IAMconsortium/pyam/pull/416) Include `meta` in new IamDataFrames returned by aggregation functions
+
# Release v0.7.0
## Highlights
diff --git a/pyam/core.py b/pyam/core.py
index 53f3be5d4..d757b7a6d 100755
--- a/pyam/core.py
+++ b/pyam/core.py
@@ -33,9 +33,9 @@
read_pandas,
format_data,
sort_data,
+ merge_meta,
to_int,
find_depth,
- reduce_hierarchy,
pattern_match,
years_match,
month_match,
@@ -76,14 +76,18 @@ class IamDataFrame(object):
Support is provided additionally for R-style data columns for years,
like "X2015", etc.
kwargs
- if `value=col`, melt column `col` to 'value' and use `col` name as
- 'variable'; or mapping of required columns (:code:`IAMC_IDX`) to
+ If `value=<col>`, melt column `<col>` to 'value' and use `<col>` name
+ as 'variable'; or mapping of required columns (:code:`IAMC_IDX`) to
any of the following:
- one column in `data`
- multiple columns, to be concatenated by :code:`|`
- a string to be used as value for this column
+ A :class:`pandas.DataFrame` with suitable `meta` indicators can be
+ passed as `meta=<df>`. The index will be downselected to those
+ scenarios that have timeseries data.
+
Notes
-----
When initializing an :class:`IamDataFrame` from an xlsx file,
@@ -115,10 +119,14 @@ def _init(self, data, **kwargs):
"""Process data and set attributes for new instance"""
# import data from pd.DataFrame or read from source
if isinstance(data, pd.DataFrame) or isinstance(data, pd.Series):
+ meta = kwargs.pop('meta') if 'meta' in kwargs else None
_data = format_data(data.copy(), **kwargs)
elif has_ix and isinstance(data, ixmp.TimeSeries):
+ # TODO read meta indicators from ixmp
+ meta = None
_data = read_ix(data, **kwargs)
else:
+ meta = None
logger.info('Reading file `{}`'.format(data))
_data = read_file(data, **kwargs)
@@ -135,6 +143,11 @@ def _init(self, data, **kwargs):
self.meta = self.data[META_IDX].drop_duplicates().set_index(META_IDX)
self.reset_exclude()
+ # merge meta dataframe (if given in kwargs)
+ if meta is not None:
+ self.meta = merge_meta(meta.loc[_make_index(self.data)],
+ self.meta, ignore_meta_conflict=True)
+
# if initializing from xlsx, try to load `meta` table from file
meta_sheet = kwargs.get('meta_sheet_name', 'meta')
if isstr(data) and data.endswith('.xlsx') and meta_sheet is not False\
@@ -254,8 +267,9 @@ def append(self, other, ignore_meta_conflict=False, inplace=False,
**kwargs):
"""Append any IamDataFrame-like object to this object
- Columns in `other.meta` that are not in `self.meta` are always merged,
- duplicate region-variable-unit-year rows raise a ValueError.
+ Indicators in `other.meta` that are not in `self.meta` are merged.
+ Missing values are set to `NaN`.
+ Conflicting `data` rows always raise a `ValueError`.
Parameters
----------
@@ -266,8 +280,9 @@ def append(self, other, ignore_meta_conflict=False, inplace=False,
any meta columns present in `self` and `other` are not identical.
inplace : bool, default False
if True, do operation inplace and return None
- kwargs : initializing other as IamDataFrame
+ kwargs
passed to :class:`IamDataFrame(other, **kwargs) `
+ if `other` is not already an IamDataFrame
"""
if not isinstance(other, IamDataFrame):
other = IamDataFrame(other, **kwargs)
@@ -278,41 +293,15 @@ def append(self, other, ignore_meta_conflict=False, inplace=False,
ret = self.copy() if not inplace else self
- diff = other.meta.index.difference(ret.meta.index)
- intersect = other.meta.index.intersection(ret.meta.index)
-
- # merge other.meta columns not in self.meta for existing scenarios
- if not intersect.empty:
- # if not ignored, check that overlapping meta dataframes are equal
- if not ignore_meta_conflict:
- cols = [i for i in other.meta.columns if i in ret.meta.columns]
- if not ret.meta.loc[intersect, cols].equals(
- other.meta.loc[intersect, cols]):
- conflict_idx = (
- pd.concat([ret.meta.loc[intersect, cols],
- other.meta.loc[intersect, cols]]
- ).drop_duplicates()
- .index.drop_duplicates()
- )
- msg = 'conflict in `meta` for scenarios {}'.format(
- [i for i in pd.DataFrame(index=conflict_idx).index])
- raise ValueError(msg)
-
- cols = [i for i in other.meta.columns if i not in ret.meta.columns]
- _meta = other.meta.loc[intersect, cols]
- ret.meta = ret.meta.merge(_meta, how='outer',
- left_index=True, right_index=True)
-
- # join other.meta for new scenarios
- if not diff.empty:
- ret.meta = ret.meta.append(other.meta.loc[diff, :], sort=False)
+ # merge `meta` tables
+ ret.meta = merge_meta(ret.meta, other.meta, ignore_meta_conflict)
# append other.data (verify integrity for no duplicates)
_data = ret.data.set_index(sorted(ret._LONG_IDX)).append(
other.data.set_index(sorted(other._LONG_IDX)),
verify_integrity=True)
- # merge extra columns in `data` and set `LONG_IDX`
+ # merge extra columns in `data` and set `self._LONG_IDX`
ret.extra_cols += [i for i in other.extra_cols
if i not in ret.extra_cols]
ret._LONG_IDX = IAMC_IDX + [ret.time_col] + ret.extra_cols
@@ -928,7 +917,7 @@ def aggregate(self, variable, components=None, method='sum',
if append is True:
self.append(_df, inplace=True)
else:
- return IamDataFrame(_df)
+ return IamDataFrame(_df, meta=self.meta)
def check_aggregate(self, variable, components=None, method='sum',
exclude_on_fail=False, multiplier=1, **kwargs):
@@ -1019,7 +1008,7 @@ def aggregate_region(self, variable, region='World', subregions=None,
if append is True:
self.append(_df, region=region, inplace=True)
else:
- return IamDataFrame(_df, region=region)
+ return IamDataFrame(_df, region=region, meta=self.meta)
def check_aggregate_region(self, variable, region='World', subregions=None,
components=False, method='sum', weight=None,
@@ -1095,17 +1084,17 @@ def aggregate_time(self, variable, column='subannual', value='year',
----------
variable : str or list of str
variable(s) to be aggregated
- column : str, default 'subannual'
+ column : str, optional
the data column to be used as subannual time representation
- value : str, default 'year
+ value : str, optional
the name of the aggregated (subannual) time
components : list of str
subannual timeslices to be aggregated; defaults to all subannual
- timeslices other than ``value``
- method : func or str, default 'sum'
+ timeslices other than `value`
+ method : func or str, optional
method to use for aggregation,
e.g. :func:`numpy.mean`, :func:`numpy.sum`, 'min', 'max'
- append : bool, default False
+ append : bool, optional
append the aggregate timeseries to `self` and return None,
else return aggregate timeseries as new :class:`IamDataFrame`
"""
@@ -1120,9 +1109,7 @@ def aggregate_time(self, variable, column='subannual', value='year',
if append is True:
self.append(_df, inplace=True)
else:
- df = IamDataFrame(_df)
- df.meta = self.meta.loc[_make_index(df.data)]
- return df
+ return IamDataFrame(_df, meta=self.meta)
def downscale_region(self, variable, region='World', subregions=None,
proxy=None, weight=None, append=False):
@@ -1180,9 +1167,7 @@ def downscale_region(self, variable, region='World', subregions=None,
if append is True:
self.append(_data, inplace=True)
else:
- df = IamDataFrame(_data)
- df.meta = self.meta.loc[_make_index(df.data)]
- return df
+ return IamDataFrame(_data, meta=self.meta)
def _all_other_regions(self, region, variable=None):
"""Return list of regions other than `region` containing `variable`"""
@@ -1796,9 +1781,15 @@ def _apply_criteria(df, criteria, **kwargs):
def _make_index(df, cols=META_IDX):
- """Create an index from the columns of a dataframe"""
- return pd.MultiIndex.from_tuples(
- pd.unique(list(zip(*[df[col] for col in cols]))), names=tuple(cols))
+ """Create an index from the columns of a dataframe or series"""
+ def _get_col(c):
+ try:
+ return df.index.get_level_values(c)
+ except KeyError:
+ return df[c]
+
+ index = pd.unique(list(zip(*[_get_col(col) for col in cols])))
+ return pd.MultiIndex.from_tuples(index, names=tuple(cols))
def validate(df, criteria={}, exclude_on_fail=False, **kwargs):
diff --git a/pyam/utils.py b/pyam/utils.py
index 230d3a460..24a3674ce 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -280,6 +280,39 @@ def sort_data(data, cols):
return data.sort_values(cols)[cols + ['value']].reset_index(drop=True)
+def merge_meta(left, right, ignore_meta_conflict=False):
+ """Merge two `meta` tables; raise if values are in conflict (optional)
+
+ If conflicts are ignored, values in `left` take precedence over `right`.
+ """
+ left = left.copy() # make a copy to not change the original object
+ diff = right.index.difference(left.index)
+ sect = right.index.intersection(left.index)
+
+ # merge `right` into `left` for overlapping scenarios ( `sect`)
+ if not sect.empty:
+ # if not ignored, check that overlapping `meta` columns are equal
+ if not ignore_meta_conflict:
+ cols = [i for i in right.columns if i in left.columns]
+ if not left.loc[sect, cols].equals(right.loc[sect, cols]):
+ conflict_idx = (
+ pd.concat([right.loc[sect, cols], left.loc[sect, cols]])
+ .drop_duplicates().index.drop_duplicates()
+ )
+ msg = 'conflict in `meta` for scenarios {}'.format(
+ [i for i in pd.DataFrame(index=conflict_idx).index])
+ raise ValueError(msg)
+ # merge new columns
+ cols = [i for i in right.columns if i not in left.columns]
+ left = left.merge(right.loc[sect, cols], how='outer',
+ left_index=True, right_index=True)
+
+ # join `other.meta` for new scenarios (`diff`)
+ if not diff.empty:
+ left = left.append(right.loc[diff, :], sort=False)
+
+ return left
+
def find_depth(data, s='', level=None):
"""Return or assert the depth (number of ``|``) of variables
diff --git a/tests/conftest.py b/tests/conftest.py
index db833a983..6a8fde572 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,19 +3,20 @@
matplotlib.use('agg')
import os
-from requests.exceptions import SSLError
+from requests.exceptions import ConnectionError
import pytest
+import numpy as np
import pandas as pd
from datetime import datetime
-from pyam import IamDataFrame, IAMC_IDX, iiasa
+from pyam import IamDataFrame, META_IDX, IAMC_IDX, iiasa
# verify whether IIASA database API can be reached, skip tests otherwise
try:
iiasa.Connection()
IIASA_UNAVAILABLE = False
-except SSLError:
+except ConnectionError:
IIASA_UNAVAILABLE = True
TEST_API = 'integration-test'
@@ -43,6 +44,12 @@
columns=IAMC_IDX + TEST_YEARS,
)
+META_COLS = ['number', 'string']
+META_DF = pd.DataFrame([
+ ['model_a', 'scen_a', 1, 'foo'],
+ ['model_a', 'scen_b', 2, np.nan],
+], columns=META_IDX + META_COLS).set_index(META_IDX)
+
FULL_FEATURE_DF = pd.DataFrame([
['World', 'Primary Energy', 'EJ/yr', 12, 15],
@@ -120,6 +127,8 @@ def test_df(request):
tdf = TEST_DF.rename({2005: request.param[0], 2010: request.param[1]},
axis="columns")
df = IamDataFrame(data=tdf)
+ for i in META_COLS:
+ df.set_meta(META_DF[i])
yield df
@@ -127,6 +136,8 @@ def test_df(request):
@pytest.fixture(scope="function")
def test_df_year():
df = IamDataFrame(data=TEST_DF)
+ for i in META_COLS:
+ df.set_meta(META_DF[i])
yield df
@@ -148,7 +159,9 @@ def simple_df(request):
_df = FULL_FEATURE_DF.copy()
if request.param == 'datetime':
_df.rename(DTS_MAPPING, axis="columns", inplace=True)
- yield IamDataFrame(model='model_a', scenario='scen_a', data=_df)
+ df = IamDataFrame(model='model_a', scenario='scen_a', data=_df)
+ df.set_meta('foo', 'string')
+ yield df
# IamDataFrame with subannual time resolution
@@ -165,8 +178,9 @@ def add_subannual(_data, name, value):
mapping = [('year', 1), ('winter', 0.7), ('summer', 0.3)]
lst = [add_subannual(_df.copy(), name, value) for name, value in mapping]
- yield IamDataFrame(model='model_a', scenario='scen_a', data=pd.concat(lst))
-
+ df = IamDataFrame(model='model_a', scenario='scen_a', data=pd.concat(lst))
+ df.set_meta('foo', 'string')
+ yield df
@pytest.fixture(scope="function")
def reg_df():
diff --git a/tests/test_feature_aggregate.py b/tests/test_feature_aggregate.py
index fd9a2d93b..1d667190e 100644
--- a/tests/test_feature_aggregate.py
+++ b/tests/test_feature_aggregate.py
@@ -65,7 +65,7 @@ def test_aggregate(simple_df, variable, data):
if simple_df.time_col == 'time':
_df.year = _df.year.replace(DTS_MAPPING)
_df.rename({'year': 'time'}, axis='columns', inplace=True)
- exp = IamDataFrame(_df)
+ exp = IamDataFrame(_df, meta=simple_df.meta)
for m in ['max', np.max]:
assert_iamframe_equal(simple_df.aggregate(variable, method=m), exp)
@@ -262,10 +262,11 @@ def test_aggregate_region_with_other_method(simple_df, variable, data):
if simple_df.time_col == 'time':
_df.year = _df.year.replace(DTS_MAPPING)
_df.rename({'year': 'time'}, axis='columns', inplace=True)
- exp = IamDataFrame(_df).filter(region='World')
+
+ exp = IamDataFrame(_df, meta=simple_df.meta).filter(region='World')
for m in ['max', np.max]:
- assert_iamframe_equal(simple_df.aggregate_region(variable, method=m),
- exp)
+ obs = simple_df.aggregate_region(variable, method=m)
+ assert_iamframe_equal(obs, exp)
def test_aggregate_region_with_components(simple_df):
diff --git a/tests/test_feature_append_rename.py b/tests/test_feature_append_rename.py
index 16e4cfd39..a50b7f1d3 100644
--- a/tests/test_feature_append_rename.py
+++ b/tests/test_feature_append_rename.py
@@ -7,7 +7,7 @@
from pyam import IamDataFrame, META_IDX, IAMC_IDX, compare
-from conftest import TEST_DTS
+from conftest import TEST_DTS, META_COLS
RENAME_DF = IamDataFrame(pd.DataFrame([
@@ -85,7 +85,8 @@ def test_append_same_scenario(test_df):
df = test_df.append(other, ignore_meta_conflict=True)
# check that the new meta.index is updated, but not the original one
- npt.assert_array_equal(test_df.meta.columns, ['exclude', 'col1'])
+ cols = ['exclude'] + META_COLS + ['col1']
+ npt.assert_array_equal(test_df.meta.columns, cols)
# assert that merging of meta works as expected
exp = test_df.meta.copy()
@@ -191,9 +192,9 @@ def test_rename_index(test_df):
# test meta changes
exp = pd.DataFrame([
- ['model_b', 'scen_c', False],
- ['model_a', 'scen_b', False],
- ], columns=['model', 'scenario', 'exclude']
+ ['model_b', 'scen_c', False, 1, 'foo'],
+ ['model_a', 'scen_b', False, 2, np.nan],
+ ], columns=['model', 'scenario', 'exclude'] + META_COLS
).set_index(META_IDX)
pd.testing.assert_frame_equal(obs.meta, exp)
@@ -222,10 +223,10 @@ def test_rename_append(test_df):
# test meta changes
exp = pd.DataFrame([
- ['model_a', 'scen_a', False],
- ['model_a', 'scen_b', False],
- ['model_b', 'scen_c', False],
- ], columns=['model', 'scenario', 'exclude']
+ ['model_a', 'scen_a', False, 1, 'foo'],
+ ['model_a', 'scen_b', False, 2, np.nan],
+ ['model_b', 'scen_c', False, 1, 'foo'],
+ ], columns=['model', 'scenario', 'exclude'] + META_COLS
).set_index(META_IDX)
pd.testing.assert_frame_equal(obs.meta, exp)
diff --git a/tests/test_iiasa.py b/tests/test_iiasa.py
index 6ae3ef43c..ef4eec24c 100644
--- a/tests/test_iiasa.py
+++ b/tests/test_iiasa.py
@@ -9,7 +9,7 @@
from pyam import IamDataFrame, iiasa, read_iiasa, META_IDX
from pyam.testing import assert_iamframe_equal
-from conftest import IIASA_UNAVAILABLE, TEST_API, TEST_API_NAME
+from conftest import IIASA_UNAVAILABLE, META_COLS, TEST_API, TEST_API_NAME
if IIASA_UNAVAILABLE:
pytest.skip('IIASA database API unavailable', allow_module_level=True)
@@ -23,7 +23,6 @@
)
VERSION_COLS = ['version', 'is_default']
-META_COLS = ['number', 'string']
META_DF = pd.DataFrame([
['model_a', 'scen_a', 1, True, 1, 'foo'],
['model_a', 'scen_b', 1, True, 2, np.nan],
diff --git a/tests/test_io.py b/tests/test_io.py
index e8380fd6e..4c51da765 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -60,16 +60,16 @@ def test_load_meta(test_df, args):
pd.testing.assert_series_equal(obs['category'], exp['category'])
-def test_load_ssp_database_downloaded_file(test_df_year):
- exp = test_df_year.filter(**FILTER_ARGS).as_pandas()
+def test_load_ssp_database_downloaded_file(test_pd_df):
+ exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas()
obs_df = IamDataFrame(os.path.join(
TEST_DATA_DIR, 'test_SSP_database_raw_download.xlsx')
)
pd.testing.assert_frame_equal(obs_df.as_pandas(), exp)
-def test_load_rcp_database_downloaded_file(test_df_year):
- exp = test_df_year.filter(**FILTER_ARGS).as_pandas()
+def test_load_rcp_database_downloaded_file(test_pd_df):
+ exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas()
obs_df = IamDataFrame(os.path.join(
TEST_DATA_DIR, 'test_RCP_database_raw_download.xlsx')
)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index ae66d2508..6c273a569 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,7 +1,9 @@
+import pytest
import pandas as pd
import numpy as np
+from pandas import testing as pdt
-from pyam import utils
+from pyam import utils, META_IDX
TEST_VARS = pd.Series(['foo', 'foo|bar', 'foo|bar|baz'])
TEST_CONCAT_SERIES = pd.Series(['foo', 'bar', 'baz'], index=['f', 'b', 'z'])
@@ -186,3 +188,28 @@ def test_reduce_hierarchy_neg1():
def test_reduce_hierarchy_neg2():
assert utils.reduce_hierarchy('foo|bar|baz', -2) == 'foo'
+
+
+def test_merge_meta():
+ # test merging of two meta tables
+ left = pd.DataFrame([
+ ['model_a', 'scen_a', 'foo', 1],
+ ['model_a', 'scen_b', 'bar', 2],
+ ], columns=META_IDX + ['string', 'value']).set_index(META_IDX)
+ right = pd.DataFrame([
+ ['model_a', 'scen_a', 'bar', 2],
+ ['model_b', 'scen_a', 'baz', 3],
+ ], columns=META_IDX + ['string', 'value2']).set_index(META_IDX)
+
+ # merge conflict raises an error
+ pytest.raises(ValueError, utils.merge_meta, left, right)
+
+ # merge conflict ignoring errors yields expected results
+ exp = pd.DataFrame([
+ ['model_a', 'scen_a', 'foo', 1, 2],
+ ['model_a', 'scen_b', 'bar', 2, np.nan],
+ ['model_b', 'scen_a', 'baz', np.nan, 3],
+ ], columns=META_IDX + ['string', 'value', 'value2']).set_index(META_IDX)
+
+ obs = utils.merge_meta(left, right, ignore_meta_conflict=True)
+ pdt.assert_frame_equal(exp, obs)