Implement compatibility with pandas 2.0 #739

Merged 12 commits on Apr 14, 2023
2 changes: 2 additions & 0 deletions RELEASE_NOTES.md
@@ -1,5 +1,7 @@
 # Next Release

+- [#738](https://github.com/IAMconsortium/pyam/pull/738) Ensure compatibility with **pandas v2.0**
+
 # Release v1.8.0

 ## Highlights
426 changes: 21 additions & 405 deletions docs/tutorials/quantiles.ipynb

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion pyam/figures.py
@@ -51,7 +51,10 @@ def sankey(df, mapping):
         mapping, orient="index", columns=["source", "target"]
     ).merge(df._data, how="left", left_index=True, right_on="variable")
     label_mapping = dict(
-        [(label, i) for i, label in enumerate(set(_df["source"].append(_df["target"])))]
+        [
+            (label, i)
+            for i, label in enumerate(set(pd.concat([_df["source"], _df["target"]])))
+        ]
     )
     _df.replace(label_mapping, inplace=True)
     region = get_index_levels(_df, "region")[0]
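This hunk is needed because `Series.append` was deprecated in pandas 1.4 and removed in pandas 2.0. A minimal sketch of the replacement pattern, with illustrative data (the labels below are not from the pyam test suite):

```python
import pandas as pd

source = pd.Series(["Coal", "Gas"])
target = pd.Series(["Electricity", "Electricity"])

# pandas < 2.0: labels = set(source.append(target))  # AttributeError in 2.0
labels = set(pd.concat([source, target]))

# enumerate the unique labels, as sankey() does to build its node mapping
label_mapping = {label: i for i, label in enumerate(labels)}
```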
2 changes: 1 addition & 1 deletion pyam/iiasa.py
@@ -526,7 +526,7 @@ def query(self, default_only=True, meta=True, **kwargs):
         if islistable(meta):
             # always merge 'version' (even if not requested explicitly)
             # 'run_id' is required to determine `_args`, dropped later
-            _meta = _meta[set(meta).union(["version", "run_id"])]
+            _meta = _meta[list(set(meta).union(["version", "run_id"]))]
         else:
             _meta = self._query_index(default_only=default_only).set_index(META_IDX)

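Context for this fix: pandas 2.0 rejects a set passed as a column indexer with a `TypeError` (older versions only warned), so the union of requested meta columns must be converted to a list first. A small sketch with made-up column data:

```python
import pandas as pd

_meta = pd.DataFrame({"version": [1], "run_id": [7], "category": ["a"]})
meta = ["category"]

cols = set(meta).union(["version", "run_id"])
# pandas 2.0: _meta[cols] raises TypeError ("Passing a set as an indexer ...")
_meta = _meta[list(cols)]
```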
18 changes: 9 additions & 9 deletions pyam/statistics.py
@@ -198,10 +198,11 @@ def add(self, data, header, row=None, subheader=None):
                 else (levels + [[row]], [[0]] * (self.idx_depth + 1))
             )
             _stats_f.index = pd.MultiIndex(levels=lvls, codes=lbls)
-            _stats = _stats_f if _stats is None else _stats.append(_stats_f)
+            _stats = _stats_f if _stats is None else pd.concat([_stats, _stats_f])

         # add header
         _stats = pd.concat([_stats], keys=[header], names=[""], axis=1)
+        _stats.index.names = [None] * len(_stats.index.names)
         subheader = _stats.columns.get_level_values(1).unique()
         self._add_to_header(header, subheader)

@@ -272,15 +273,14 @@ def format_rows(
            legend = "{} ({})".format(
                center, "max, min" if fullrange is True else "interquartile range"
            )
-           index = row.index.droplevel(2).drop_duplicates()
-           count_arg = dict(tuples=[("count", "")], names=[None, legend])
+
+           row_index = row.index.droplevel(2).drop_duplicates()
+           ret_index = pd.MultiIndex.from_tuples([("count", "")]).append(row_index)
+           ret_index.names = [None, legend]
        else:
-           msg = "displaying multiple range formats simultaneously not supported"
-           raise NotImplementedError(msg)
+           raise ValueError("Use either fullrange or interquartile range.")

-       ret = pd.Series(
-           index=pd.MultiIndex.from_tuples(**count_arg).append(index), dtype=float
-       )
+       ret = pd.Series(index=ret_index, dtype=float)

        row = row.sort_index()
        center = "50%" if center == "median" else center
@@ -295,7 +295,7 @@
            upper, lower = ("max", "min") if fullrange is True else ("75%", "25%")

        # format `describe()` columns to string output
-       for i in index:
+       for i in row_index:
            x = row.loc[i]
            _count = x["count"]
            if np.isnan(_count) or _count == 0:
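`DataFrame.append` was removed in pandas 2.0 along with `Series.append`, so the statistics table is now stacked with `pd.concat`; the extra line resets the index names that the header concat would otherwise carry along. A sketch with illustrative data:

```python
import pandas as pd

part_a = pd.DataFrame({"count": [3.0]}, index=pd.MultiIndex.from_tuples([("scen", "a")]))
part_b = pd.DataFrame({"count": [2.0]}, index=pd.MultiIndex.from_tuples([("scen", "b")]))

# pandas < 2.0: stats = part_a.append(part_b)  # AttributeError in 2.0
stats = pd.concat([part_a, part_b])

# prepend a column level as the header, then drop the index names,
# mirroring the two lines touched in Statistics.add above
stats = pd.concat([stats], keys=["primary"], names=[""], axis=1)
stats.index.names = [None] * len(stats.index.names)
```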
3 changes: 2 additions & 1 deletion pyam/utils.py
@@ -401,7 +401,8 @@ def format_data(df, index, **kwargs):
        df = _format_from_legacy_database(df)

    # replace missing units by an empty string for user-friendly filtering
-   df = df.assign(unit=df["unit"].fillna(""))
+   if "unit" in df.columns:
+       df = df.assign(unit=df["unit"].fillna(""))

    df, time_col, extra_cols = _format_data_to_series(df, index)

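A small sketch of why the guard is needed, using an illustrative frame that lacks a "unit" column (column contents below are made up):

```python
import pandas as pd

df = pd.DataFrame({"variable": ["Primary Energy"], "value": [1.0]})

# without the guard, df["unit"] raises KeyError for unit-less input
if "unit" in df.columns:
    df = df.assign(unit=df["unit"].fillna(""))
```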
2 changes: 1 addition & 1 deletion setup.cfg
@@ -66,7 +66,7 @@ optional_io_formats =
 tutorials =
     pypandoc
     nbformat
-    nbconvert
+    nbconvert >= 7.3
     jupyter_client
     ipykernel
 docs =
6 changes: 3 additions & 3 deletions tests/test_core.py
@@ -86,8 +86,8 @@ def test_init_df_with_float_cols_raises(test_pd_df):


 def test_init_df_with_duplicates_raises(test_df):
-    _df = test_df.timeseries()
-    _df = _df.append(_df.iloc[0]).reset_index()
+    _df = test_df.timeseries().reset_index()
+    _df = pd.concat([_df, _df.iloc[0].to_frame().T])
     match = "0 model_a scen_a World Primary Energy EJ/yr"
     with pytest.raises(ValueError, match=match):
         IamDataFrame(_df)
@@ -604,7 +604,7 @@ def test_interpolate_datetimes(test_df):
     test_df.interpolate(some_date, inplace=True)
     obs = test_df.filter(time=some_date).data["value"].reset_index(drop=True)
     exp = pd.Series([3, 1.5, 4], name="value")
-    pd.testing.assert_series_equal(obs, exp, check_less_precise=True)
+    pd.testing.assert_series_equal(obs, exp, rtol=0.01)
     # redo the interpolation and check that no duplicates are added
     test_df.interpolate(some_date, inplace=True)
     assert not test_df.filter()._data.index.duplicated().any()
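Two pandas 2.0 removals show up in these tests: `DataFrame.append` (replaced by `pd.concat`, with `.to_frame().T` turning the duplicated row back into a one-row frame) and the `check_less_precise` argument of `assert_series_equal`, which is replaced by an explicit tolerance. A sketch of the latter, with illustrative values:

```python
import pandas as pd

obs = pd.Series([3.0, 1.5001, 4.0], name="value")
exp = pd.Series([3.0, 1.5, 4.0], name="value")

# pandas < 2.0: pd.testing.assert_series_equal(obs, exp, check_less_precise=True)
pd.testing.assert_series_equal(obs, exp, rtol=0.01)  # passes within 1% tolerance
```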
13 changes: 8 additions & 5 deletions tests/test_filter.py
@@ -55,7 +55,8 @@ def test_filter_mixed_time_domain(test_df_mixed, arg_year, arg_time):
     assert obs.time_col == "year"
     assert obs.time_domain == "year"
     assert obs.year == [2005]
-    pdt.assert_index_equal(obs.time, pd.Int64Index([2005], name="time"))
+    pdt.assert_index_equal(obs.time, pd.Index([2005], name="time"))
+    assert obs.time.dtype == "int64"


 def test_filter_time_domain_raises(test_df_year):
@@ -106,10 +107,12 @@ def test_filter_day(test_df, test_day):

 def test_filter_with_numpy_64_date_vals(test_df):
     dates = test_df[test_df.time_col].unique()
-    key = "year" if test_df.time_col == "year" else "time"
-    res_0 = test_df.filter(**{key: dates[0]})
-    res = test_df.filter(**{key: dates})
-    assert np.equal(res_0.data[res_0.time_col].values, dates[0]).all()
+    res_0 = test_df.filter(**{test_df.time_col: dates[0]})
+    res = test_df.filter(**{test_df.time_col: dates})
+    if test_df.time_col == "year":
+        assert res_0.data[res_0.time_col].values[0] == dates[0]
+    else:
+        assert res_0.data[res_0.time_col].values[0] == np.datetime64(dates[0])
     assert res.equals(test_df)

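The expected index changes because `pd.Int64Index` (with its `UInt64Index` and `Float64Index` siblings) was removed in pandas 2.0; a plain `pd.Index` now carries the dtype, which the added assertion checks explicitly. A minimal sketch:

```python
import pandas as pd
import pandas.testing as pdt

# pandas < 2.0: expected = pd.Int64Index([2005], name="time")
expected = pd.Index([2005], name="time")

assert expected.dtype == "int64"
pdt.assert_index_equal(expected, pd.Index([2005], name="time"))
```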
2 changes: 1 addition & 1 deletion tests/test_statistics.py
@@ -9,6 +9,7 @@ def stats_add(stats, plot_df):
     # test describing as pd.DataFrame
     primary = plot_df.filter(variable="Primary Energy", year=2005).timeseries()
     stats.add(data=primary, header="primary")
+
     # test describing as unamed pd.Series with `subheader` arg
     coal = plot_df.filter(variable="Primary Energy|Coal").timeseries()[2010]
     coal.name = None
@@ -48,7 +49,6 @@ def test_statistics(plot_df):
     idx = pd.MultiIndex(
         levels=[["category", "scen"], ["b", "a", "test"]],
         codes=[[0, 0, 1], [0, 1, 2]],
-        names=["", ""],
     )
     cols = pd.MultiIndex(
         levels=[["count", "primary", "coal"], ["", 2005]],
2 changes: 1 addition & 1 deletion tests/test_time.py
@@ -39,7 +39,7 @@ def get_subannual_df(date1, date2):
 @pytest.mark.parametrize(
     "time, domain, index",
     [
-        (TEST_YEARS, "year", pd.Int64Index([2005, 2010], name="time")),
+        (TEST_YEARS, "year", pd.Index([2005, 2010], name="time")),
         (TEST_DTS, "datetime", pd.DatetimeIndex(TEST_DTS, name="time")),
         (TEST_TIME_STR, "datetime", pd.DatetimeIndex(TEST_DTS, name="time")),
         (TEST_TIME_STR_HR, "datetime", pd.DatetimeIndex(TEST_TIME_STR_HR, name="time")),
150 changes: 29 additions & 121 deletions tests/test_tutorials.py
@@ -1,138 +1,46 @@
-import io
-import os
-import subprocess
-import sys
 import pytest

-from .conftest import here, IIASA_UNAVAILABLE
-
 try:
     import nbformat
-except:
+    from nbconvert.preprocessors import ExecutePreprocessor
+except ModuleNotFoundError:
     pytest.skip(
         "Missing Jupyter Notebook and related dependencies", allow_module_level=True
     )

-tut_path = os.path.join(here, "..", "docs", "tutorials")
-
-
-# taken from the excellent example here:
-# https://blog.thedataincubator.com/2016/06/testing-jupyter-notebooks/
-
-
-def _notebook_run(path, kernel=None, timeout=60, capsys=None):
-    """Execute a notebook via nbconvert and collect output.
-    :returns (parsed nb object, execution errors)
-    """
-    major_version = sys.version_info[0]
-    dirname, __ = os.path.split(path)
-    os.chdir(dirname)
-    fname = os.path.join(here, "test.ipynb")
-    args = [
-        "jupyter",
-        "nbconvert",
-        "--to",
-        "notebook",
-        "--execute",
-        "--ExecutePreprocessor.timeout={}".format(timeout),
-        "--output",
-        fname,
-        path,
-    ]
-    subprocess.check_call(args)
-
-    nb = nbformat.read(io.open(fname, encoding="utf-8"), nbformat.current_nbformat)
-
-    errors = [
-        output
-        for cell in nb.cells
-        if "outputs" in cell
-        for output in cell["outputs"]
-        if output.output_type == "error"
-    ]
-
-    # removing files fails on CI (GitHub Actions) on Windows & py3.8
-    try:
-        os.remove(fname)
-    except PermissionError:
-        pass
-
-    return nb, errors
-
-
-def test_pyam_first_steps(capsys):
-    fname = os.path.join(tut_path, "pyam_first_steps.ipynb")
-    nb, errors = _notebook_run(fname, capsys=capsys)
-    assert errors == []
-    assert os.path.exists(os.path.join(tut_path, "tutorial_export.xlsx"))
-
-    def has_log_output(cell):
-        return cell["cell_type"] == "code" and any(
-            "Running in a notebook" in output.get("text", "")
-            for output in cell["outputs"]
-        )
-
-    assert any(has_log_output(cell) for cell in nb["cells"])
-
-
-def test_data_table_formats():
-    fname = os.path.join(tut_path, "data_table_formats.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
-
-
-def test_unit_conversion():
-    fname = os.path.join(tut_path, "unit_conversion.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
-
-
-def test_aggregating_downscaling_consistency():
-    fname = os.path.join(tut_path, "aggregating_downscaling_consistency.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
-
-
-def test_subannual_time_resolution():
-    fname = os.path.join(tut_path, "subannual_time_resolution.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
-
-
-def test_pyam_logo():
-    fname = os.path.join(tut_path, "pyam_logo.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
+from .conftest import here, IIASA_UNAVAILABLE
+
+nb_path = here.parent / "docs" / "tutorials"

-def test_ipcc_colors():
-    fname = os.path.join(tut_path, "ipcc_colors.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
+def _run_notebook(file, timeout=30):
+    """Execute a notebook file"""
+    with open(nb_path / f"{file}.ipynb") as f:
+        nb = nbformat.read(f, as_version=4)

-def test_legends():
-    fname = os.path.join(tut_path, "legends.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
+    ep = ExecutePreprocessor(timeout=timeout)
+    ep.preprocess(nb, {"metadata": {"path": nb_path}})


-def test_ops():
-    fname = os.path.join(tut_path, "algebraic_operations.ipynb")
-    nb, errors = _notebook_run(fname)
-    assert errors == []
+@pytest.mark.parametrize(
+    "file",
+    [
+        "pyam_first_steps",
+        "data_table_formats",
+        "unit_conversion",
+        "aggregating_downscaling_consistency",
+        "subannual_time_resolution",
+        "pyam_logo",
+        "ipcc_colors",
+        "legends",
+        "algebraic_operations",
+        "aggregating_variables_and_plotting_with_negative_values",
+    ],
+)
+def test_tutorial_notebook(file):
+    _run_notebook(file)


 @pytest.mark.skipif(IIASA_UNAVAILABLE, reason="IIASA database API unavailable")
-def test_iiasa_dbs():
-    fname = os.path.join(tut_path, "iiasa_dbs.ipynb")
-    nb, errors = _notebook_run(fname, timeout=600)
-    assert errors == []
-
-
-def test_aggregating_variables_and_plotting_with_negative_values():
-    fname = os.path.join(
-        tut_path, "aggregating_variables_and_plotting_with_negative_values.ipynb"
-    )
-    nb, errors = _notebook_run(fname)
-    assert errors == []
+def test_tutorial_iiasa_dbs():
+    _run_notebook("iiasa_dbs")