Skip to content

Commit

Permalink
Refactor initialization for simpler maintenance (#730)
Browse files Browse the repository at this point in the history
* Split format_data into functions

Co-authored-by: Matthew Gidden <[email protected]>

* Reorder format_data

Co-authored-by: Matthew Gidden <[email protected]>

* Split out format_data_to_series from format_data

* Fix unused import

* Avoid copying data

Co-authored-by: Matthew Gidden <[email protected]>

* Add function doc-strings

* Update release notes

* Apply suggestions from code review

Co-authored-by: Daniel Huppmann <[email protected]>

* Update pyam/utils.py

---------

Co-authored-by: Matthew Gidden <[email protected]>
Co-authored-by: Daniel Huppmann <[email protected]>
  • Loading branch information
3 people authored Feb 24, 2023
1 parent 7a97516 commit 128fb19
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 58 deletions.
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Next Release

- [#727](https://github.com/IAMconsortium/pyam/pull/730) Refactor initialization code
- [#729](https://github.com/IAMconsortium/pyam/pull/729) Improve performance at initialization
- [#723](https://github.com/IAMconsortium/pyam/pull/723) Ensure correct order of `time` attribute

Expand Down
4 changes: 2 additions & 2 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,8 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
_data = read_file(data, index=index, **kwargs)

# cast data from pandas
elif isinstance(data, pd.DataFrame) or isinstance(data, pd.Series):
_data = format_data(data.copy(), index=index, **kwargs)
elif isinstance(data, (pd.DataFrame, pd.Series)):
_data = format_data(data, index=index, **kwargs)

# unsupported `data` args
elif islistable(data):
Expand Down
139 changes: 83 additions & 56 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,6 @@
import pandas as pd
from collections.abc import Iterable

try:
import seaborn as sns
except ImportError:
pass

logger = logging.getLogger(__name__)

# common indices
Expand Down Expand Up @@ -181,14 +176,10 @@ def read_file(path, *args, **kwargs):
return format_data(read_pandas(path, *args, **kwargs), **format_kwargs)


def format_data(df, index, **kwargs):
"""Convert a pandas.Dataframe or pandas.Series to the required format"""
if isinstance(df, pd.Series):
df.name = df.name or "value"
df = df.to_frame()
def _convert_r_columns(df):
"""Check and convert R-style year columns"""

# check for R-style year columns, converting where necessary
def convert_r_columns(c):
def strip_R_integer_prefix(c):
try:
first = c[0]
second = c[1:]
Expand All @@ -204,7 +195,11 @@ def convert_r_columns(c):
pass
return c

df.columns = df.columns.map(convert_r_columns)
return df.set_axis(df.columns.map(strip_R_integer_prefix), axis="columns")


def _knead_data(df, **kwargs):
"""Replace, rename and concat according to user arguments"""

# if `value` is given but not `variable`,
# melt value columns and use column name as `variable`
Expand Down Expand Up @@ -236,26 +231,26 @@ def convert_r_columns(c):
else:
raise ValueError(f"Invalid argument for casting `{col}: {value}`")

# all lower case
str_cols = [c for c in df.columns if isstr(c)]
df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True)

if "notes" in df.columns: # this came from the database
logger.info("Ignoring notes column in dataframe")
df.drop(columns="notes", inplace=True)
col = df.columns[0] # first column has database copyright notice
df = df[~df[col].str.contains("database", case=False)]
if "scenario" in df.columns and "model" not in df.columns:
# model and scenario are jammed together in RCP data
scen = df["scenario"]
df.loc[:, "model"] = scen.apply(lambda s: s.split("-")[0].strip())
df.loc[:, "scenario"] = scen.apply(
lambda s: "-".join(s.split("-")[1:]).strip()
)
return df


def _format_from_legacy_database(df):
"""Process data from legacy databases (SSP and earlier)"""

logger.info("Ignoring notes column in `data`")
df.drop(columns="notes", inplace=True)
col = df.columns[0] # first column has database copyright notice
df = df[~df[col].str.contains("database", case=False)]
if "scenario" in df.columns and "model" not in df.columns:
# model and scenario are jammed together in RCP data
parts = df["scenario"].str.split("-", n=1, expand=True)
df = df.assign(model=parts[0].str.strip(), scenario=parts[1].str.strip())

# reset the index if meaningful entries are included there
if not list(df.index.names) == [None]:
df.reset_index(inplace=True)
return df


def _intuit_column_groups(df, index):
"""Check and categorise columns in dataframe"""

# check that there is no column in the timeseries data with reserved names
conflict_cols = [i for i in df.columns if i in ILLEGAL_COLS]
Expand All @@ -276,7 +271,6 @@ def convert_r_columns(c):

# check whether data in wide format (standard IAMC) or long format (`value` column)
if "value" in df.columns:

# check if time column is given as `year` (int) or `time` (datetime)
if "year" in df.columns and "time" not in df.columns:
time_col = "year"
Expand All @@ -289,19 +283,8 @@ def convert_r_columns(c):
for c in df.columns
if c not in index + REQUIRED_COLS + [time_col, "value"]
]

# replace missing units by an empty string for user-friendly filtering
df.loc[df.unit.isnull(), "unit"] = ""

_validate_complete_index(df[index + REQUIRED_COLS + extra_cols])

# cast to pd.Series
idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols
df = df.set_index(idx_cols).value
df.dropna(inplace=True)

data_cols = []
else:

# if in wide format, check if columns are years (int) or datetime
cols = [c for c in df.columns if c not in index + REQUIRED_COLS]
year_cols, time_cols, extra_cols = [], [], []
Expand All @@ -323,23 +306,67 @@ def convert_r_columns(c):

if year_cols and not time_cols:
time_col = "year"
melt_cols = sorted(year_cols)
data_cols = sorted(year_cols)
else:
time_col = "time"
melt_cols = sorted(year_cols) + sorted(time_cols)
if not melt_cols:
data_cols = sorted(year_cols) + sorted(time_cols)
if not data_cols:
raise ValueError("Missing time domain")

# replace missing units by an empty string for user-friendly filtering
df.loc[df.unit.isnull(), "unit"] = ""
return time_col, extra_cols, data_cols


def _format_data_to_series(df, index):
"""Convert a long or wide pandas dataframe to a series with the required columns"""

time_col, extra_cols, data_cols = _intuit_column_groups(df, index)

_validate_complete_index(df[index + REQUIRED_COLS + extra_cols])

idx_order = index + REQUIRED_COLS + [time_col] + extra_cols

if data_cols:
# wide format
df = (
df.set_index(index + REQUIRED_COLS + extra_cols)
.rename_axis(columns=time_col)
.stack(dropna=True)
.rename("value")
.reorder_levels(idx_order)
)
else:
# long format
df = df.set_index(idx_order)["value"].dropna()

return df, time_col, extra_cols


def format_data(df, index, **kwargs):
"""Convert a pandas.Dataframe or pandas.Series to the required format"""

if isinstance(df, pd.Series):
if not df.name:
df = df.rename("value")
df = df.reset_index()
elif not list(df.index.names) == [None]:
# reset the index if meaningful entries are included there
df = df.reset_index()

df = _convert_r_columns(df)

if kwargs:
df = _knead_data(df, **kwargs)

# cast all columns names to lower case
df.rename(columns={c: str(c).lower() for c in df.columns if isstr(c)}, inplace=True)

if "notes" in df.columns: # this came from a legacy database (SSP or earlier)
df = _format_from_legacy_database(df)

_validate_complete_index(df[index + REQUIRED_COLS + extra_cols])
# replace missing units by an empty string for user-friendly filtering
df = df.assign(unit=df["unit"].fillna(""))

# cast to long format, set
df.set_index(index + REQUIRED_COLS + extra_cols, inplace=True)
df = df.stack(dropna=True)
df.name = "value"
df.index.names = df.index.names[:-1] + [time_col]
df, time_col, extra_cols = _format_data_to_series(df, index)

# cast value column to numeric
try:
Expand Down

0 comments on commit 128fb19

Please sign in to comment.