Implement compatibility with pandas 2.0 (#739)
danielhuppmann authored Apr 14, 2023
1 parent a87e64e commit 449b77c
Showing 12 changed files with 82 additions and 549 deletions.
2 changes: 2 additions & 0 deletions RELEASE_NOTES.md
@@ -1,5 +1,7 @@
# Next Release

+- [#738](https://github.com/IAMconsortium/pyam/pull/738) Ensure compatibility with **pandas v2.0**
+
# Release v1.8.0

## Highlights
426 changes: 21 additions & 405 deletions docs/tutorials/quantiles.ipynb

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion pyam/figures.py
@@ -51,7 +51,10 @@ def sankey(df, mapping):
mapping, orient="index", columns=["source", "target"]
).merge(df._data, how="left", left_index=True, right_on="variable")
label_mapping = dict(
-        [(label, i) for i, label in enumerate(set(_df["source"].append(_df["target"])))]
+        [
+            (label, i)
+            for i, label in enumerate(set(pd.concat([_df["source"], _df["target"]])))
+        ]
)
_df.replace(label_mapping, inplace=True)
region = get_index_levels(_df, "region")[0]
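
Background on this change: pandas 2.0 removed `Series.append` and `DataFrame.append` (deprecated since pandas 1.4), and `pd.concat` is the documented replacement. A minimal sketch of the pattern, using made-up data rather than pyam internals:

import pandas as pd

# pandas < 2.0 allowed: labels = source.append(target)
source = pd.Series(["Coal", "Gas"])
target = pd.Series(["Gas", "Electricity"])
labels = pd.concat([source, target])

# enumerate the unique labels, as in the sankey() change above
label_mapping = {label: i for i, label in enumerate(set(labels))}
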
2 changes: 1 addition & 1 deletion pyam/iiasa.py
@@ -526,7 +526,7 @@ def query(self, default_only=True, meta=True, **kwargs):
if islistable(meta):
# always merge 'version' (even if not requested explicitly)
# 'run_id' is required to determine `_args`, dropped later
-            _meta = _meta[set(meta).union(["version", "run_id"])]
+            _meta = _meta[list(set(meta).union(["version", "run_id"]))]
else:
_meta = self._query_index(default_only=default_only).set_index(META_IDX)

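
Background: pandas 2.0 no longer accepts a `set` as a column indexer (a set has no defined order), so the selection is wrapped in `list`. A standalone sketch with hypothetical column names:

import pandas as pd

_meta = pd.DataFrame({"version": [1], "run_id": [7], "scenario": ["a"]})
cols = set(["version"]).union(["run_id"])

# _meta[cols] raises TypeError in pandas 2.0; a list indexer works
subset = _meta[list(cols)]
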
18 changes: 9 additions & 9 deletions pyam/statistics.py
@@ -198,10 +198,11 @@ def add(self, data, header, row=None, subheader=None):
else (levels + [[row]], [[0]] * (self.idx_depth + 1))
)
_stats_f.index = pd.MultiIndex(levels=lvls, codes=lbls)
-        _stats = _stats_f if _stats is None else _stats.append(_stats_f)
+        _stats = _stats_f if _stats is None else pd.concat([_stats, _stats_f])

# add header
_stats = pd.concat([_stats], keys=[header], names=[""], axis=1)
+        _stats.index.names = [None] * len(_stats.index.names)
subheader = _stats.columns.get_level_values(1).unique()
self._add_to_header(header, subheader)

@@ -272,15 +273,14 @@ def format_rows(
legend = "{} ({})".format(
center, "max, min" if fullrange is True else "interquartile range"
)
-        index = row.index.droplevel(2).drop_duplicates()
-        count_arg = dict(tuples=[("count", "")], names=[None, legend])
+
+        row_index = row.index.droplevel(2).drop_duplicates()
+        ret_index = pd.MultiIndex.from_tuples([("count", "")]).append(row_index)
+        ret_index.names = [None, legend]
else:
-        msg = "displaying multiple range formats simultaneously not supported"
-        raise NotImplementedError(msg)
+        raise ValueError("Use either fullrange or interquartile range.")

-    ret = pd.Series(
-        index=pd.MultiIndex.from_tuples(**count_arg).append(index), dtype=float
-    )
+    ret = pd.Series(index=ret_index, dtype=float)

row = row.sort_index()
center = "50%" if center == "median" else center
@@ -295,7 +295,7 @@
upper, lower = ("max", "min") if fullrange is True else ("75%", "25%")

# format `describe()` columns to string output
-    for i in index:
+    for i in row_index:
x = row.loc[i]
_count = x["count"]
if np.isnan(_count) or _count == 0:
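
Note that pandas 2.0 removed only `Series.append` and `DataFrame.append`; `Index.append`, used above to build `ret_index`, remains in the API. A minimal sketch of the new index construction, assuming a two-level statistics header:

import pandas as pd

row_index = pd.MultiIndex.from_tuples([("mean", ""), ("median", "")])
ret_index = pd.MultiIndex.from_tuples([("count", "")]).append(row_index)
ret_index.names = [None, "median (interquartile range)"]

# empty float series over the combined index, filled later by format_rows()
ret = pd.Series(index=ret_index, dtype=float)
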
3 changes: 2 additions & 1 deletion pyam/utils.py
@@ -401,7 +401,8 @@ def format_data(df, index, **kwargs):
df = _format_from_legacy_database(df)

# replace missing units by an empty string for user-friendly filtering
-    df = df.assign(unit=df["unit"].fillna(""))
+    if "unit" in df.columns:
+        df = df.assign(unit=df["unit"].fillna(""))

df, time_col, extra_cols = _format_data_to_series(df, index)

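
The new guard makes `format_data()` tolerant of input that has no `unit` column; the fill is only applied when the column exists. A standalone sketch with made-up data:

import pandas as pd

df = pd.DataFrame({"variable": ["Primary Energy"], "unit": [None]})

# replace missing units by an empty string for user-friendly filtering
if "unit" in df.columns:
    df = df.assign(unit=df["unit"].fillna(""))
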
2 changes: 1 addition & 1 deletion setup.cfg
@@ -66,7 +66,7 @@ optional_io_formats =
tutorials =
pypandoc
nbformat
-    nbconvert
+    nbconvert >= 7.3
jupyter_client
ipykernel
docs =
6 changes: 3 additions & 3 deletions tests/test_core.py
@@ -86,8 +86,8 @@ def test_init_df_with_float_cols_raises(test_pd_df):


def test_init_df_with_duplicates_raises(test_df):
-    _df = test_df.timeseries()
-    _df = _df.append(_df.iloc[0]).reset_index()
+    _df = test_df.timeseries().reset_index()
+    _df = pd.concat([_df, _df.iloc[0].to_frame().T])
match = "0 model_a scen_a World Primary Energy EJ/yr"
with pytest.raises(ValueError, match=match):
IamDataFrame(_df)
@@ -604,7 +604,7 @@ def test_interpolate_datetimes(test_df):
test_df.interpolate(some_date, inplace=True)
obs = test_df.filter(time=some_date).data["value"].reset_index(drop=True)
exp = pd.Series([3, 1.5, 4], name="value")
-    pd.testing.assert_series_equal(obs, exp, check_less_precise=True)
+    pd.testing.assert_series_equal(obs, exp, rtol=0.01)
# redo the interpolation and check that no duplicates are added
test_df.interpolate(some_date, inplace=True)
assert not test_df.filter()._data.index.duplicated().any()
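
Background: the `check_less_precise` argument of `pd.testing.assert_series_equal` was deprecated in pandas 1.1 and removed in pandas 2.0; approximate comparisons are now expressed via `rtol` and `atol`. Standalone sketch:

import pandas as pd

obs = pd.Series([3.001, 1.5, 4.0], name="value")
exp = pd.Series([3.0, 1.5, 4.0], name="value")

# values must agree to within a relative tolerance of 1%
pd.testing.assert_series_equal(obs, exp, rtol=0.01)
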
13 changes: 8 additions & 5 deletions tests/test_filter.py
@@ -55,7 +55,8 @@ def test_filter_mixed_time_domain(test_df_mixed, arg_year, arg_time):
assert obs.time_col == "year"
assert obs.time_domain == "year"
assert obs.year == [2005]
-    pdt.assert_index_equal(obs.time, pd.Int64Index([2005], name="time"))
+    pdt.assert_index_equal(obs.time, pd.Index([2005], name="time"))
+    assert obs.time.dtype == "int64"


def test_filter_time_domain_raises(test_df_year):
@@ -106,10 +107,12 @@ def test_filter_day(test_df, test_day):

def test_filter_with_numpy_64_date_vals(test_df):
dates = test_df[test_df.time_col].unique()
-    key = "year" if test_df.time_col == "year" else "time"
-    res_0 = test_df.filter(**{key: dates[0]})
-    res = test_df.filter(**{key: dates})
-    assert np.equal(res_0.data[res_0.time_col].values, dates[0]).all()
+    res_0 = test_df.filter(**{test_df.time_col: dates[0]})
+    res = test_df.filter(**{test_df.time_col: dates})
+    if test_df.time_col == "year":
+        assert res_0.data[res_0.time_col].values[0] == dates[0]
+    else:
+        assert res_0.data[res_0.time_col].values[0] == np.datetime64(dates[0])
assert res.equals(test_df)


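
Background: `pd.Int64Index` (together with `UInt64Index` and `Float64Index`) was deprecated in pandas 1.4 and removed in pandas 2.0; a plain `pd.Index` now carries the dtype, which is why the dtype is asserted separately above. Sketch:

import pandas as pd

# pandas < 2.0: index = pd.Int64Index([2005], name="time")
index = pd.Index([2005], name="time")
assert index.dtype == "int64"
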
2 changes: 1 addition & 1 deletion tests/test_statistics.py
@@ -9,6 +9,7 @@ def stats_add(stats, plot_df):
# test describing as pd.DataFrame
primary = plot_df.filter(variable="Primary Energy", year=2005).timeseries()
stats.add(data=primary, header="primary")
+
# test describing as unnamed pd.Series with `subheader` arg
coal = plot_df.filter(variable="Primary Energy|Coal").timeseries()[2010]
coal.name = None
@@ -48,7 +49,6 @@ def test_statistics(plot_df):
idx = pd.MultiIndex(
levels=[["category", "scen"], ["b", "a", "test"]],
codes=[[0, 0, 1], [0, 1, 2]],
-        names=["", ""],
)
cols = pd.MultiIndex(
levels=[["count", "primary", "coal"], ["", 2005]],
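
The `names=["", ""]` argument is dropped because `Statistics.add()` now resets the index names to `None` (see the pyam/statistics.py change above) and `pd.testing.assert_frame_equal` compares index names by default. A sketch of the distinction:

import pandas as pd

named = pd.MultiIndex(levels=[["category"], ["b"]], codes=[[0], [0]], names=["", ""])
unnamed = pd.MultiIndex(levels=[["category"], ["b"]], codes=[[0], [0]])

assert list(named.names) == ["", ""]
assert list(unnamed.names) == [None, None]
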
2 changes: 1 addition & 1 deletion tests/test_time.py
@@ -39,7 +39,7 @@ def get_subannual_df(date1, date2):
@pytest.mark.parametrize(
"time, domain, index",
[
(TEST_YEARS, "year", pd.Int64Index([2005, 2010], name="time")),
(TEST_YEARS, "year", pd.Index([2005, 2010], name="time")),
(TEST_DTS, "datetime", pd.DatetimeIndex(TEST_DTS, name="time")),
(TEST_TIME_STR, "datetime", pd.DatetimeIndex(TEST_DTS, name="time")),
(TEST_TIME_STR_HR, "datetime", pd.DatetimeIndex(TEST_TIME_STR_HR, name="time")),
150 changes: 29 additions & 121 deletions tests/test_tutorials.py
@@ -1,138 +1,46 @@
-import io
-import os
-import subprocess
-import sys
import pytest

-from .conftest import here, IIASA_UNAVAILABLE

try:
import nbformat
-except:
+    from nbconvert.preprocessors import ExecutePreprocessor
+except ModuleNotFoundError:
pytest.skip(
"Missing Jupyter Notebook and related dependencies", allow_module_level=True
)

-tut_path = os.path.join(here, "..", "docs", "tutorials")
-
-
-# taken from the excellent example here:
-# https://blog.thedataincubator.com/2016/06/testing-jupyter-notebooks/


-def _notebook_run(path, kernel=None, timeout=60, capsys=None):
-    """Execute a notebook via nbconvert and collect output.
-    :returns (parsed nb object, execution errors)
-    """
-    major_version = sys.version_info[0]
-    dirname, __ = os.path.split(path)
-    os.chdir(dirname)
-    fname = os.path.join(here, "test.ipynb")
-    args = [
-        "jupyter",
-        "nbconvert",
-        "--to",
-        "notebook",
-        "--execute",
-        "--ExecutePreprocessor.timeout={}".format(timeout),
-        "--output",
-        fname,
-        path,
-    ]
-    subprocess.check_call(args)
-
-    nb = nbformat.read(io.open(fname, encoding="utf-8"), nbformat.current_nbformat)
-
-    errors = [
-        output
-        for cell in nb.cells
-        if "outputs" in cell
-        for output in cell["outputs"]
-        if output.output_type == "error"
-    ]
-
-    # removing files fails on CI (GitHub Actions) on Windows & py3.8
-    try:
-        os.remove(fname)
-    except PermissionError:
-        pass
-
-    return nb, errors


-def test_pyam_first_steps(capsys):
-    fname = os.path.join(tut_path, "pyam_first_steps.ipynb")
-    nb, errors = _notebook_run(fname, capsys=capsys)
-    assert errors == []
-    assert os.path.exists(os.path.join(tut_path, "tutorial_export.xlsx"))
-
-    def has_log_output(cell):
-        return cell["cell_type"] == "code" and any(
-            "Running in a notebook" in output.get("text", "")
-            for output in cell["outputs"]
-        )
-
-    assert any(has_log_output(cell) for cell in nb["cells"])


-def test_data_table_formats():
-    fname = os.path.join(tut_path, "data_table_formats.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
-
-
-def test_unit_conversion():
-    fname = os.path.join(tut_path, "unit_conversion.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
-
-
-def test_aggregating_downscaling_consistency():
-    fname = os.path.join(tut_path, "aggregating_downscaling_consistency.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
-
-
-def test_subannual_time_resolution():
-    fname = os.path.join(tut_path, "subannual_time_resolution.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
-
-
-def test_pyam_logo():
-    fname = os.path.join(tut_path, "pyam_logo.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
+from .conftest import here, IIASA_UNAVAILABLE
+
+nb_path = here.parent / "docs" / "tutorials"
+
-def test_ipcc_colors():
-    fname = os.path.join(tut_path, "ipcc_colors.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []

+def _run_notebook(file, timeout=30):
+    """Execute a notebook file"""
+    with open(nb_path / f"{file}.ipynb") as f:
+        nb = nbformat.read(f, as_version=4)
+
-def test_legends():
-    fname = os.path.join(tut_path, "legends.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
+    ep = ExecutePreprocessor(timeout=timeout)
+    ep.preprocess(nb, {"metadata": {"path": nb_path}})


-def test_ops():
-    fname = os.path.join(tut_path, "algebraic_operations.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
+@pytest.mark.parametrize(
+    "file",
+    [
+        "pyam_first_steps",
+        "data_table_formats",
+        "unit_conversion",
+        "aggregating_downscaling_consistency",
+        "subannual_time_resolution",
+        "pyam_logo",
+        "ipcc_colors",
+        "legends",
+        "algebraic_operations",
+        "aggregating_variables_and_plotting_with_negative_values",
+    ],
+)
+def test_tutorial_notebook(file):
+    _run_notebook(file)


@pytest.mark.skipif(IIASA_UNAVAILABLE, reason="IIASA database API unavailable")
-def test_iiasa_dbs():
-    fname = os.path.join(tut_path, "iiasa_dbs.ipynb")
-    nb, errors = _notebook_run(fname, timeout=600)
-    assert errors == []
-
-
-def test_aggregating_variables_and_plotting_with_negative_values():
-    fname = os.path.join(
-        tut_path, "aggregating_variables_and_plotting_with_negative_values.ipynb"
-    )
-    nb, errors = _notebook_run(fname)
-    assert errors == []
+def test_tutorial_iiasa_dbs():
+    _run_notebook("iiasa_dbs")
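
The rewrite drops the subprocess call to the `jupyter nbconvert` CLI in favor of nbconvert's programmatic API, which is likely the motivation for the `nbconvert >= 7.3` pin in setup.cfg above. A minimal standalone sketch of in-process notebook execution; the path is illustrative:

import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

with open("docs/tutorials/pyam_first_steps.ipynb") as f:
    nb = nbformat.read(f, as_version=4)

# preprocess() raises CellExecutionError on the first failing cell,
# so no manual collection of error outputs is needed
ep = ExecutePreprocessor(timeout=30)
ep.preprocess(nb, {"metadata": {"path": "docs/tutorials"}})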
