From 3ec111a6b89813bed635bc7206d1bec2fa808ba6 Mon Sep 17 00:00:00 2001 From: Peter Inglesby Date: Tue, 17 Oct 2023 14:17:23 +0100 Subject: [PATCH] Fix percentiles The percentile boundaries generated by np.arange did not round correctly when scaled up to integers. Specifically: >>> (np.arange(0.01, 0.1, 0.01) * 100).astype(int) array([1, 2, 3, 4, 5, 6, 6, 8, 9]) >>> np.arange(0.01, 0.1, 0.01)[6] 0.06999999999999999 We work around this by specifying the percentile boundaries explicitly. --- ebmdatalab/__init__.py | 2 +- ebmdatalab/charts.py | 8 ++++---- ebmdatalab/tests/test_charts.py | 21 +++++++++++++++------ 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/ebmdatalab/__init__.py b/ebmdatalab/__init__.py index 061a262..94feb35 100644 --- a/ebmdatalab/__init__.py +++ b/ebmdatalab/__init__.py @@ -1,3 +1,3 @@ """Package for ebmdatalab jupyter notebook stuff """ -__version__ = "0.0.29" +__version__ = "0.0.30" diff --git a/ebmdatalab/charts.py b/ebmdatalab/charts.py index b818b45..d704a50 100644 --- a/ebmdatalab/charts.py +++ b/ebmdatalab/charts.py @@ -24,9 +24,9 @@ def add_percentiles(df, period_column=None, column=None, show_outer_percentiles= Adds `percentile` column. """ - deciles = np.arange(0.1, 1, 0.1) - bottom_percentiles = np.arange(0.01, 0.1, 0.01) - top_percentiles = np.arange(0.91, 1, 0.01) + deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + bottom_percentiles = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09] + top_percentiles = [0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99] if show_outer_percentiles: quantiles = np.concatenate((deciles, bottom_percentiles, top_percentiles)) else: @@ -34,7 +34,7 @@ def add_percentiles(df, period_column=None, column=None, show_outer_percentiles= df = df.groupby(period_column)[column].quantile(quantiles).reset_index() df = df.rename(index=str, columns={"level_1": "percentile"}) # create integer range of percentiles - df["percentile"] = df["percentile"].apply(lambda x: int(x * 100)) + df["percentile"] = (df["percentile"] * 100).astype(int) return df diff --git a/ebmdatalab/tests/test_charts.py b/ebmdatalab/tests/test_charts.py index a2fc56f..c84f0fd 100644 --- a/ebmdatalab/tests/test_charts.py +++ b/ebmdatalab/tests/test_charts.py @@ -4,12 +4,21 @@ def test_add_percentiles(): - df = pd.DataFrame(np.random.rand(1000, 1), columns=["val"]) - months = pd.date_range("2018-01-01", periods=12, freq="M") - df["month"] = np.random.choice(months, len(df)) - df = charts.add_percentiles(df, period_column="month", column="val") - # This is a statistically-likely test, so might fail! - assert (df[df.percentile == 99].val > 0.75).all() + rows = [["2023-09-01", n * 5] for n in range(1001)] + df = pd.DataFrame(rows, columns=["month", "val"]) + dfp = ( + charts.add_percentiles(df, period_column="month", column="val") + .sort_values("percentile") + .reset_index(drop=True) + ) + percentiles = [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 20, 30, 40, 50, 60, 70, 80, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, + ] + expected_rows = [["2023-09-01", percentile, percentile * 50.0] for percentile in percentiles] + expected = pd.DataFrame(expected_rows, columns=["month", "percentile", "val"]) + pd.testing.assert_frame_equal(dfp, expected) def test_deciles_chart():