Skip to content

Commit

Permalink
Initialize an IamDataFrame from pd.DataFrame with formatting specs (
Browse files Browse the repository at this point in the history
#199)

* initial impl for giving default pyam dataframes

* move again into core

* small fixes

* update concat to try to cast to dataframe

* appease stickler

* fix `TypeError` in `concat()`

* add `concat_with_pipe()` in `utils` with some tests

* cast elements in `concat_with_pipe()` to `str`

* drop `nan` instead of casting to `str`

* insert function `reduce_hierarchy()` and add tests

* raise error if initializing `IamDataFrame` with duplicate rows in `data`

* remove unused legacy function

* add auxiliary function `sort_data()` for consistent ordering of cols

* use `sort_data()` in `append()`

* add `sort_data()` to some unit tests to make them pass

* refactor and add first test for `df_to_pyam()`

* clean-up before stickler gives me an earful...

* clean-up of implementation of `df_to_pyam()`

* update docstrings

* allow input-df with columns `year` and `time`

* defining `defaults` doesn't seem to be necessary

* require `value` arg in `df_to_pyam()` instead of using all columns

* check for conflicts with existing columns, add test

* add option in `df_to_pyam()` to rename cols

* add feature to concat required column from multiple given columns

* make `cast_to_iam` part of `format_data()`

* rename function to `read_file()` because it can only read one file

* pass kwargs for `format_data` through `read_file()`

* pep8 and docstring cleanup

* add to release notes

* minor edits as requested by @gidden in the review
  • Loading branch information
danielhuppmann authored and gidden committed Feb 22, 2019
1 parent 5a6992d commit fcb81fe
Show file tree
Hide file tree
Showing 6 changed files with 203 additions and 49 deletions.
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

# Next Release

- [#199](https://github.com/IAMconsortium/pyam/pull/199) Initializing an `IamDataFrame` accepts kwargs to fill or create from the data any missing required columns
- [#195](https://github.com/IAMconsortium/pyam/pull/195) Fix filtering for `time`, `day` and `hour` to use generic `pattern_match()` (if such a column exists) in 'year'-formatted IamDataFrames
- [#192](https://github.com/IAMconsortium/pyam/pull/192) Extend `utils.find_depth()` to optionally return depth (as list of ints) rather than assert level tests
- [#190](https://github.com/IAMconsortium/pyam/pull/190) Add `concat()` function
Expand Down
53 changes: 26 additions & 27 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
from pyam.run_control import run_control
from pyam.utils import (
write_sheet,
read_files,
read_file,
read_pandas,
format_data,
sort_data,
to_int,
find_depth,
pattern_match,
Expand All @@ -38,7 +39,6 @@
REGION_IDX,
IAMC_IDX,
SORT_IDX,
LONG_IDX,
GROUP_IDX
)
from pyam.read_ixmp import read_ix
Expand All @@ -50,30 +50,29 @@ class IamDataFrame(object):
It provides a number of diagnostic features (including validation of data,
completeness of variables provided) as well as a number of visualization
and plotting tools.
"""
Parameters
----------
data: ixmp.TimeSeries, ixmp.Scenario, pd.DataFrame or data file
an instance of an TimeSeries or Scenario (requires `ixmp`),
or pd.DataFrame or data file with IAMC-format data columns.
A pd.DataFrame can have the required data as columns or index.
kwargs:
if `value=col`, melt `col` to `value` and use `col` name as `variable`;
else, mapping of columns required for an `IamDataFrame` to:
- one column in `df`
- multiple columns, which will be concatenated by pipe
- a string to be used as value for this column
"""
def __init__(self, data, **kwargs):
"""Initialize an instance of an IamDataFrame
Parameters
----------
data: ixmp.TimeSeries, ixmp.Scenario, pd.DataFrame or data file
an instance of an TimeSeries or Scenario (requires `ixmp`),
or pd.DataFrame or data file with IAMC-format data columns.
A pd.DataFrame can have the required data as columns or index.
Special support is provided for data files downloaded directly from
IIASA SSP and RCP databases. If you run into any problems loading
data, please make an issue at:
https://github.com/IAMconsortium/pyam/issues
"""
"""Initialize an instance of an IamDataFrame"""
# import data from pd.DataFrame or read from source
if isinstance(data, pd.DataFrame) or isinstance(data, pd.Series):
_data = format_data(data.copy())
_data = format_data(data.copy(), **kwargs)
elif has_ix and isinstance(data, ixmp.TimeSeries):
_data = read_ix(data, **kwargs)
else:
_data = read_files(data, **kwargs)
_data = read_file(data, **kwargs)

self.data, self.time_col, self.extra_cols = _data
# cast time_col to desired format
Expand Down Expand Up @@ -181,6 +180,7 @@ def append(self, other, ignore_meta_conflict=False, inplace=False,
any meta columns present in `self` and `other` are not identical.
inplace : bool, default False
If True, do operation inplace and return None
kwargs are passed through to `IamDataFrame(other, **kwargs)`
"""
if not isinstance(other, IamDataFrame):
other = IamDataFrame(other, **kwargs)
Expand Down Expand Up @@ -224,15 +224,14 @@ def append(self, other, ignore_meta_conflict=False, inplace=False,
ret.meta = ret.meta.append(other.meta.loc[diff, :], **sort_kwarg)

# append other.data (verify integrity for no duplicates)
ret.data.set_index(ret._LONG_IDX, inplace=True)
_other = other.data.set_index(other._LONG_IDX)
ret.data = ret.data.append(_other, verify_integrity=True)\
.reset_index(drop=False)
_data = ret.data.set_index(ret._LONG_IDX).append(
other.data.set_index(other._LONG_IDX), verify_integrity=True)

# merge extra columns in `data` and set `LONG_IDX`
ret.extra_cols += [i for i in other.extra_cols
if i not in ret.extra_cols]
ret._LONG_IDX = IAMC_IDX + [ret.time_col] + ret.extra_cols
ret.data = sort_data(_data.reset_index(), ret._LONG_IDX)

if not inplace:
return ret
Expand Down Expand Up @@ -1501,13 +1500,13 @@ def compare(left, right, left_label='left', right_label='right',

def concat(dfs):
"""Concatenate a series of `pyam.IamDataFrame`-like objects together"""
if not hasattr(dfs, '__iter__'):
raise TypeError('Input data must be iterable (e.g., list or tuple)')
if isstr(dfs) or not hasattr(dfs, '__iter__'):
msg = 'Argument must be a non-string iterable (e.g., list or tuple)'
raise TypeError(msg)

_df = None
for df in dfs:
if not isinstance(df, IamDataFrame):
raise TypeError('Input contains non-`pyam.IamDataFrame`')
df = df if isinstance(df, IamDataFrame) else IamDataFrame(df)
if _df is None:
_df = copy.deepcopy(df)
else:
Expand Down
90 changes: 72 additions & 18 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,22 +112,63 @@ def read_pandas(fname, *args, **kwargs):
return df


def read_files(fnames, *args, **kwargs):
"""Read data from a snapshot file saved in the standard IAMC format
def read_file(fname, *args, **kwargs):
"""Read data from a file saved in the standard IAMC format
or a table with year/value columns
"""
if not isstr(fnames):
if not isstr(fname):
raise ValueError('reading multiple files not supported, '
'please use `pyam.IamDataFrame.append()`')
logger().info('Reading `{}`'.format(fnames))
return format_data(read_pandas(fnames, *args, **kwargs))
logger().info('Reading `{}`'.format(fname))
format_kwargs = {}
# extract kwargs that are intended for `format_data`
for c in [i for i in IAMC_IDX + ['year', 'time', 'value'] if i in kwargs]:
format_kwargs[c] = kwargs.pop(c)
return format_data(read_pandas(fname, *args, **kwargs), **format_kwargs)


def format_data(df):
"""Convert an imported dataframe and check all required columns"""
def format_data(df, **kwargs):
"""Convert a `pd.Dataframe` or `pd.Series` to the required format"""
if isinstance(df, pd.Series):
df = df.to_frame()

# ensure that only either `value` or `variable` custom setting is used
_cols = ['value', 'variable']
if any([i in kwargs for i in _cols]) and \
all([i in kwargs or i in df.columns for i in _cols]):
raise ValueError('using both `value` and `variable` is not valid!')

# if `value` arg is given, melt columns and use column name as `variable`
if 'value' in kwargs:
value = kwargs.pop('value')
idx = set(df.columns) & (set(IAMC_IDX) | set(['year', 'time']))
_df = df.set_index(list(idx))
print(_df)
dfs = []
for v in value if islistable(value) else [value]:
if v not in df.columns:
raise ValueError('column `{}` does not exist!'.format(v))
vdf = _df[v].to_frame().rename(columns={v: 'value'})
vdf['variable'] = v
dfs.append(vdf.reset_index())
df = pd.concat(dfs).reset_index(drop=True)

# for other columns, do a rename or concat multiple columns to IAMC-style
for col, value in kwargs.items():
if col in df:
raise ValueError('conflict of kwarg with column in dataframe!')

if isstr(value) and value in df:
df.rename(columns={value: col}, inplace=True)
elif islistable(value) and all([c in df.columns for c in value]):
df[col] = df.apply(lambda x: concat_with_pipe(x, value), axis=1)
df.drop(value, axis=1, inplace=True)
elif isstr(value):
df[col] = value
else:
raise ValueError('invalid argument for casting `{}: {}`'
.format(col, value))

# all lower case
str_cols = [c for c in df.columns if isstr(c)]
df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True)
Expand Down Expand Up @@ -157,9 +198,9 @@ def format_data(df):
if 'value' in df.columns:
# check if time column is given as `year` (int) or `time` (datetime)
cols = df.columns
if 'year' in cols and 'time' not in cols:
if 'year' in cols:
time_col = 'year'
elif 'time' in cols and 'year' not in cols:
elif 'time' in cols:
time_col = 'time'
else:
msg = 'invalid time format, must have either `year` or `time`!'
Expand Down Expand Up @@ -194,18 +235,18 @@ def format_data(df):
# cast value columns to numeric, drop NaN's, sort data
df['value'] = df['value'].astype('float64')
df.dropna(inplace=True)
df.sort_values(META_IDX + ['variable', time_col, 'region'] + extra_cols,
inplace=True)

return df, time_col, extra_cols
# check for duplicates and return sorted data
idx_cols = IAMC_IDX + [time_col] + extra_cols
if any(df[idx_cols].duplicated()):
raise ValueError('duplicate rows in `data`!')

return sort_data(df, idx_cols), time_col, extra_cols

def style_df(df, style='heatmap'):
    """Return a styled version of `df` for notebook display

    Supported styles are 'highlight_not_max' (yellow background on all
    cells that are not the column maximum) and 'heatmap' (green gradient);
    any other value returns None.
    """
    if style == 'highlight_not_max':
        def _mark_non_max(s):
            # empty style for the max, yellow background for all others
            return ['' if is_max else 'background-color: yellow'
                    for is_max in s == s.max()]
        return df.style.apply(_mark_non_max)
    if style == 'heatmap':
        palette = sns.light_palette("green", as_cmap=True)
        return df.style.background_gradient(cmap=palette)

def sort_data(data, cols):
    """Return `data` sorted by `cols` with a consistent column order

    Rows are sorted by `cols`; the returned frame contains `cols` followed
    by the 'value' column, with a fresh integer index.
    """
    ordered = data.sort_values(cols)
    column_order = cols + ['value']
    return ordered.loc[:, column_order].reset_index(drop=True)


def find_depth(data, s='', level=None):
Expand Down Expand Up @@ -395,3 +436,16 @@ def to_int(x, index=False):
return x
else:
return _x


def concat_with_pipe(x, cols=None):
    """Concatenate a `pd.Series` separated by `|`, drop `None` or `np.nan`

    Parameters
    ----------
    x : pd.Series
        entries to be concatenated
    cols : list, optional
        subset (and order) of entries to use, defaults to all of `x.index`

    Returns
    -------
    str
        the non-missing entries of `x` joined by `|`
    """
    cols = cols or x.index
    # checking `v not in [None, np.nan]` only catches the `np.nan` singleton
    # by identity; NaN values read from a float column (`np.float64`) or a
    # plain `float('nan')` would slip through and break the join.
    # `v != v` is True exactly for NaN-like floats, so test that instead.
    return '|'.join(
        [x[i] for i in cols
         if x[i] is not None
         and not (isinstance(x[i], float) and x[i] != x[i])]
    )


def reduce_hierarchy(x, depth):
    """Reduce the hierarchy (levels separated by `|`) of `x` to `depth`

    A non-negative `depth` keeps levels 0 through `depth`; a negative
    `depth` counts back from the full depth of `x`.
    """
    levels = x.split('|')
    if depth < 0:
        # translate a negative depth into the equivalent absolute level
        depth = len(levels) + depth - 1
    return '|'.join(levels[:depth + 1])
53 changes: 53 additions & 0 deletions tests/test_cast_to_iamc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import pytest
import pandas as pd
from pyam import IamDataFrame, compare


def test_cast_from_value_col(meta_df):
    """Casting a wide frame via the `value` kwarg matches the fixture"""
    wide_df = pd.DataFrame(
        [
            ['model_a', 'scen_a', 'World', 'EJ/y', 2005, 1, 0.5],
            ['model_a', 'scen_a', 'World', 'EJ/y', 2010, 6., 3],
            ['model_a', 'scen_b', 'World', 'EJ/y', 2005, 2, None],
            ['model_a', 'scen_b', 'World', 'EJ/y', 2010, 7, None],
        ],
        columns=['model', 'scenario', 'region', 'unit', 'year',
                 'Primary Energy', 'Primary Energy|Coal'],
    )
    obs = IamDataFrame(wide_df,
                       value=['Primary Energy', 'Primary Energy|Coal'])

    assert compare(meta_df, obs).empty
    pd.testing.assert_frame_equal(obs.data, meta_df.data)


def test_cast_with_model_arg_raises():
    """A kwarg that clashes with an existing column raises a ValueError"""
    wide_df = pd.DataFrame(
        [
            ['model_a', 'scen_a', 'World', 'EJ/y', 2005, 1, 0.5],
        ],
        columns=['model', 'scenario', 'region', 'unit', 'year',
                 'Primary Energy', 'Primary Energy|Coal'],
    )
    with pytest.raises(ValueError):
        IamDataFrame(wide_df, model='foo')


def test_cast_with_model_arg(meta_df):
    """A renamed `model` column can be mapped back via the `model` kwarg"""
    data = meta_df.timeseries().reset_index()
    data.rename(columns={'model': 'foo'}, inplace=True)

    obs = IamDataFrame(data, model='foo')
    assert compare(meta_df, obs).empty
    pd.testing.assert_frame_equal(obs.data, meta_df.data)


def test_cast_by_column_concat(meta_df):
    """Multiple columns passed as `variable` are concatenated by pipe"""
    wide_df = pd.DataFrame(
        [
            ['scen_a', 'World', 'Primary Energy', None, 'EJ/y', 1, 6.],
            ['scen_a', 'World', 'Primary Energy', 'Coal', 'EJ/y', 0.5, 3],
            ['scen_b', 'World', 'Primary Energy', None, 'EJ/y', 2, 7],
        ],
        columns=['scenario', 'region', 'var_1', 'var_2', 'unit', 2005, 2010],
    )

    obs = IamDataFrame(wide_df, model='model_a', variable=['var_1', 'var_2'])
    assert compare(meta_df, obs).empty
    pd.testing.assert_frame_equal(obs.data, meta_df.data)
14 changes: 10 additions & 4 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from numpy import testing as npt

from pyam import IamDataFrame, validate, categorize, \
require_variable, filter_by_meta, META_IDX, IAMC_IDX
require_variable, filter_by_meta, META_IDX, IAMC_IDX, sort_data
from pyam.core import _meta_idx, concat

from conftest import TEST_DATA_DIR
Expand Down Expand Up @@ -39,6 +39,12 @@ def test_init_df_with_float_cols_raises(test_pd_df):
pytest.raises(ValueError, IamDataFrame, data=_test_df)


def test_init_df_with_duplicates_raises(test_df):
    """Initializing an IamDataFrame with duplicate rows raises a ValueError"""
    data = test_df.timeseries()
    # duplicate the first row, then go back to the long format
    data = data.append(data.iloc[0]).reset_index()
    with pytest.raises(ValueError):
        IamDataFrame(data=data)


def test_init_df_with_float_cols(test_pd_df):
_test_df = test_pd_df.rename(columns={2005: 2005., 2010: 2010.})
obs = IamDataFrame(_test_df).timeseries().reset_index()
Expand Down Expand Up @@ -772,7 +778,7 @@ def test_filter_by_int(meta_df):
def _r5_regions_exp(df):
    """Return the expected data after mapping non-World regions to R5MAF"""
    mapped = df.filter(region='World', keep=False)
    mapped['region'] = 'R5MAF'
    return sort_data(mapped.data, mapped._LONG_IDX)


def test_map_regions_r5(reg_df):
Expand Down Expand Up @@ -841,7 +847,7 @@ def test_48b():
['model', 'scen1', 'SDN', 'var', 'unit', 2, 7],
], columns=['model', 'scenario', 'region',
'variable', 'unit', 2005, 2010],
)).data.reset_index(drop=True)
)).data

df = IamDataFrame(pd.DataFrame([
['model', 'scen', 'R5MAF', 'var', 'unit', 1, 6],
Expand All @@ -850,7 +856,7 @@ def test_48b():
'variable', 'unit', 2005, 2010],
))
obs = df.map_regions('iso', region_col='r5_region').data
obs = obs[obs.region.isin(['SSD', 'SDN'])].reset_index(drop=True)
obs = sort_data(obs[obs.region.isin(['SSD', 'SDN'])], df._LONG_IDX)

pd.testing.assert_frame_equal(obs, exp, check_index_type=False)

Expand Down
Loading

0 comments on commit fcb81fe

Please sign in to comment.