From 1fad56ce75abfd33226485c80d19bd8de0880b50 Mon Sep 17 00:00:00 2001
From: Tristan Gerrish
Date: Wed, 26 Jul 2023 11:27:53 +0100
Subject: [PATCH] simplified month_hour data binning

---
 .../src/ladybugtools_toolkit/__init__.py      |   1 +
 .../external_comfort/utci/postprocess.py      |  97 +++---------
 .../src/ladybugtools_toolkit/helpers.py       | 142 +++++++++++-------
 .../Python/tests/test_helpers.py              |  53 ++++---
 4 files changed, 140 insertions(+), 153 deletions(-)

diff --git a/LadybugTools_Engine/Python/src/ladybugtools_toolkit/__init__.py b/LadybugTools_Engine/Python/src/ladybugtools_toolkit/__init__.py
index bf737605..bb84c0d1 100644
--- a/LadybugTools_Engine/Python/src/ladybugtools_toolkit/__init__.py
+++ b/LadybugTools_Engine/Python/src/ladybugtools_toolkit/__init__.py
@@ -2,6 +2,7 @@
 import os
 import sys
 from pathlib import Path
+
 import matplotlib.pyplot as plt
 
 # override "HOME" in case IT has set this to something other than default
diff --git a/LadybugTools_Engine/Python/src/ladybugtools_toolkit/external_comfort/utci/postprocess.py b/LadybugTools_Engine/Python/src/ladybugtools_toolkit/external_comfort/utci/postprocess.py
index 4098f6ea..50417b6a 100644
--- a/LadybugTools_Engine/Python/src/ladybugtools_toolkit/external_comfort/utci/postprocess.py
+++ b/LadybugTools_Engine/Python/src/ladybugtools_toolkit/external_comfort/utci/postprocess.py
@@ -1,21 +1,23 @@
-import calendar
 import warnings
 from typing import Any, List, Tuple, Union
 
 import numpy as np
 import pandas as pd
 from ladybug.datacollection import HourlyContinuousCollection
-from ladybug.datatype.temperature import \
-    UniversalThermalClimateIndex as LB_UniversalThermalClimateIndex
+from ladybug.datatype.temperature import (
+    UniversalThermalClimateIndex as LB_UniversalThermalClimateIndex,
+)
 from ladybug.epw import EPW
 from ladybug_comfort.collection.solarcal import OutdoorSolarCal
 from ladybug_comfort.collection.utci import UTCI
 from scipy.interpolate import interp1d, interp2d
 
 from ...categorical.categories import UTCI_DEFAULT_CATEGORIES, Categorical
-from ...helpers import evaporative_cooling_effect
-from ...ladybug_extension.datacollection import (collection_from_series,
-                                                 collection_to_series)
+from ...helpers import evaporative_cooling_effect, month_hour_binned_series
+from ...ladybug_extension.datacollection import (
+    collection_from_series,
+    collection_to_series,
+)
 from ...plot.utilities import contrasting_color
 
 
@@ -489,23 +491,18 @@ def feasible_utci_category_limits(
     return df
 
 
-def month_time_binned_table(
+def month_hour_binned(
     utci_data: Union[pd.Series, HourlyContinuousCollection],
-    month_bins: Tuple[Tuple[int]] = ((3, 4, 5), (6, 7, 8), (9, 10, 11), (12, 1, 2)),
-    hour_bins: Tuple[Tuple[int]] = (
-        (5, 6, 7, 8, 9, 10, 11),
-        (12, 13, 14, 15, 16),
-        (17, 18, 19, 20),
-        (21, 22, 23, 0, 1, 2, 3, 4),
-    ),
+    month_bins: Tuple[Tuple[int]] = None,
+    hour_bins: Tuple[Tuple[int]] = None,
     utci_categories: Categorical = UTCI_DEFAULT_CATEGORIES,
     color_result: bool = False,
-    month_labels: List[str] = None,
-    hour_labels: List[str] = None,
+    month_labels: Tuple[str] = None,
+    hour_labels: Tuple[str] = None,
     agg: str = "mean",
     **kwargs,
 ) -> pd.DataFrame:
-    """Create a table with monthly time binned UTCI data.
+    """Create a table with monthly hour binned UTCI data.
 
     Args:
         utci_data (Union[pd.Series, HourlyContinuousCollection]):
@@ -546,64 +543,14 @@ def month_time_binned_table(
     if isinstance(utci_data, HourlyContinuousCollection):
         utci_data = collection_to_series(utci_data)
 
-    # check that utci_data is annual hourly data
-    if len(utci_data) < 8760:
-        raise ValueError(
-            "utci_data must be hourly data over the course of at least a year (8760+ hours)."
-        )
-
-    # check for continuity of time periods, and overlaps overnight/year
-    flat_hours = [item for sublist in hour_bins for item in sublist]
-    flat_months = [item for sublist in month_bins for item in sublist]
-
-    if (max(flat_hours) != 23) or min(flat_hours) != 0:
-        raise ValueError("hour_bins hours must be in the range 0-23")
-    if (max(flat_months) != 12) or min(flat_months) != 1:
-        raise ValueError("month_bins hours must be in the range 1-12")
-    # cehck for duplicates
-    if len(set(flat_hours)) != len(flat_hours):
-        raise ValueError("hour_bins hours must not contain duplicates")
-    if len(set(flat_months)) != len(flat_months):
-        raise ValueError("month_bins hours must not contain duplicates")
-    if (set(flat_hours) != set(list(range(24)))) or (len(set(flat_hours)) != 24):
-        raise ValueError("Input hour_bins does not contain all hours of the day")
-    if (set(flat_months) != set(list(range(1, 13, 1)))) or (
-        len(set(flat_months)) != 12
-    ):
-        raise ValueError("Input month_bins does not contain all months of the year")
-
-    # create index/column labels
-    if month_labels:
-        if len(month_labels) != len(month_bins):
-            raise ValueError("month_labels must be the same length as month_bins")
-        col_labels = month_labels
-    else:
-        col_labels = []
-        for months in month_bins:
-            if len(months) == 1:
-                col_labels.append(calendar.month_abbr[months[0]])
-            else:
-                col_labels.append(
-                    f"{calendar.month_abbr[months[0]]} to {calendar.month_abbr[months[-1]]}"
-                )
-    if hour_labels:
-        if len(hour_labels) != len(hour_bins):
-            raise ValueError("time_labels must be the same length as hour_bins")
-        row_labels = hour_labels
-    else:
-        row_labels = [f"{i[0]:02d}:00 ≤ t < {i[-1] + 1:02d}:00" for i in hour_bins]
-
-    # create indexing bins
-    values = []
-    for months in month_bins:
-        month_mask = utci_data.index.month.isin(months)
-        inner_values = []
-        for hours in hour_bins:
-            mask = utci_data.index.hour.isin(hours) & month_mask
-            avg = utci_data.loc[mask].agg(agg)
-            inner_values.append(avg)
-        values.append(inner_values)
-    df = pd.DataFrame(values, index=col_labels, columns=row_labels).T
+    df = month_hour_binned_series(
+        series=utci_data,
+        month_bins=month_bins,
+        hour_bins=hour_bins,
+        month_labels=month_labels,
+        hour_labels=hour_labels,
+        agg=agg,
+    )
 
     if color_result:
         warnings.warn(
diff --git a/LadybugTools_Engine/Python/src/ladybugtools_toolkit/helpers.py b/LadybugTools_Engine/Python/src/ladybugtools_toolkit/helpers.py
index 7610bc14..7ea2e318 100644
--- a/LadybugTools_Engine/Python/src/ladybugtools_toolkit/helpers.py
+++ b/LadybugTools_Engine/Python/src/ladybugtools_toolkit/helpers.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import calendar
 import contextlib
 import copy
 import io
@@ -8,7 +9,6 @@
 import re
 import urllib.request
 import warnings
-from calendar import month_abbr
 from datetime import datetime, timedelta
 from enum import Enum, auto
 from pathlib import Path
@@ -47,7 +47,7 @@ def ZeroPadPercentFormatter(x: float) -> str:
     return f"{x:5.0%}"
 
 
-def default_time_analysis_periods() -> List[AnalysisPeriod]:
+def default_hour_analysis_periods() -> List[AnalysisPeriod]:
     """A set of generic Analysis Period objects, spanning times of day."""
     f = io.StringIO()
     with contextlib.redirect_stdout(f):
@@ -80,7 +80,7 @@ def default_combined_analysis_periods() -> List[AnalysisPeriod]:
     f = io.StringIO()
     with contextlib.redirect_stdout(f):
         aps = []
-        for ap_time in default_time_analysis_periods():
+        for ap_time in default_hour_analysis_periods():
             for ap_month in default_month_analysis_periods():
                 aps.append(
                     AnalysisPeriod(
@@ -103,7 +103,7 @@ def default_analysis_periods() -> List[AnalysisPeriod]:
         AnalysisPeriod(),
     ]
     aps.extend(default_month_analysis_periods())
-    aps.extend(default_time_analysis_periods())
+    aps.extend(default_hour_analysis_periods())
     aps.extend(default_combined_analysis_periods())
 
     return aps
@@ -1619,12 +1619,12 @@ def remove_leap_days(
     return pd_object[~mask]
 
 
-def time_binned_dataframe(
+def month_hour_binned_series(
     series: pd.Series,
-    hour_bins: List[List[int]] = None,
-    month_bins: List[List[int]] = None,
-    hour_bin_labels: List[List[int]] = None,
-    month_bin_labels: List[List[int]] = None,
+    month_bins: Tuple[Tuple[int]] = None,
+    hour_bins: Tuple[Tuple[int]] = None,
+    month_labels: Tuple[str] = None,
+    hour_labels: Tuple[str] = None,
     agg: str = "mean",
 ) -> pd.DataFrame:
     """Bin a series by hour and month.
@@ -1632,13 +1632,13 @@
     Args:
         series (pd.Series):
             A series with a datetime index.
-        hour_bins (List[List[int]], optional):
-            A list of lists of hours to bin by. Defaults to None which bins into 24 discrete hours.
-        month_bins (List[List[int]], optional):
-            A list of lists of months to bin by. Defaults to None which bins into 12 discrete months.
-        hour_bin_labels (List[str], optional):
+        hour_bins (Tuple[Tuple[int]], optional):
+            A list of lists of hours to bin by. Defaults to None which bins into the default_hour_analysis_periods().
+        month_bins (Tuple[Tuple[int]], optional):
+            A list of lists of months to bin by. Defaults to None which bins into default_month_analysis_periods().
+        hour_labels (List[str], optional):
             A list of labels to use for the hour bins. Defaults to None which just lists the hours in each bin.
-        month_bin_labels (List[str], optional):
+        month_labels (List[str], optional):
             A list of labels to use for the month bins. Defaults to None which just lists the months in each bin.
         agg (str, optional):
             The aggregation method to use. Can be either "min", "mean", "median", "max" or "sum". Defaults to "mean".
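To make the revised signature concrete, a short usage sketch of month_hour_binned_series follows. It assumes the package is importable as ladybugtools_toolkit (matching the file paths above); the seasonal month bins and day-period hour bins mirror the defaults removed from the UTCI wrapper earlier in this patch, and the "Winter"/"Morning" style labels are illustrative only, not values defined anywhere in the patch.

    import pandas as pd

    from ladybugtools_toolkit.helpers import month_hour_binned_series

    # a full (non-leap) year of synthetic hourly values, as used in the tests
    series = pd.Series(
        data=range(8760),
        index=pd.date_range("2017-01-01 00:00:00", freq="60T", periods=8760),
    )

    df = month_hour_binned_series(
        series=series,
        # explicit, contiguous bins covering every month and hour exactly once
        month_bins=((12, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11)),
        hour_bins=(
            (5, 6, 7, 8, 9, 10, 11),
            (12, 13, 14, 15, 16),
            (17, 18, 19, 20),
            (21, 22, 23, 0, 1, 2, 3, 4),
        ),
        month_labels=("Winter", "Spring", "Summer", "Autumn"),  # illustrative labels
        hour_labels=("Morning", "Afternoon", "Evening", "Night"),
        agg="mean",
    )

    # rows are the hour bins, columns the month bins (the table is transposed on return)
    print(df.shape)  # (4, 4)

Leaving month_bins and hour_bins as None should instead derive the groupings from default_month_analysis_periods() and the renamed default_hour_analysis_periods(), as described in the docstring above.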
@@ -1666,52 +1666,82 @@
 
     # check that the series has at least 24-values per day
     if series.groupby(series.index.day_of_year).count().min() < 24:
-        raise ValueError("The series must have at least 24-values per day")
+        raise ValueError("The series must contain at least 24-values per day")
-    # add name to series if no name found
-    if series.name is None:
-        series.name = "series"
-
-    # create generic bins if none are given by user
-    if hour_bins is None:
-        hour_bins = [[i] for i in range(24)]
+    # create generic month/hour sets for binning, from existing defaults if no input
     if month_bins is None:
-        month_bins = [[i] for i in range(1, 13)]
-
-    # create generic bin labels if none are given by user
-    if hour_bin_labels is None:
-        hour_bin_labels = [", ".join([f"{j:02d}:00" for j in i]) for i in hour_bins]
-    if month_bin_labels is None:
-        month_bin_labels = [", ".join([month_abbr[j] for j in i]) for i in month_bins]
+        _months = np.arange(1, 13, 1)
+        month_bins = []
+        for ap in default_month_analysis_periods():
+            if ap.st_month == ap.end_month:
+                res = (ap.st_month,)
+            else:
+                length = ap.end_month - ap.st_month
+                res = tuple(np.roll(_months, -ap.st_month + 1)[: length + 1])
+            month_bins.append(res)
+        month_bins = tuple(month_bins)
+    if hour_bins is None:
+        _hours = np.arange(0, 24)
+        hour_bins = []
+        for ap in default_hour_analysis_periods():
+            if ap.st_hour == ap.end_hour:
+                res = (ap.st_hour,)
+            else:
+                length = ap.end_hour - ap.st_hour
+                res = tuple(np.roll(_hours, -ap.st_hour)[: length + 1])
+            hour_bins.append(res)
+        hour_bins = tuple(hour_bins)
+
+    # check for contiguity of time periods
+    flat_hours = [item for sublist in hour_bins for item in sublist]
+    flat_months = [item for sublist in month_bins for item in sublist]
+    if (max(flat_hours) != 23) or min(flat_hours) != 0:
+        raise ValueError("hour_bins hours must be in the range 0-23")
+    if (max(flat_months) != 12) or min(flat_months) != 1:
+        raise ValueError("month_bins months must be in the range 1-12")
+    # check for duplicates
+    if len(set(flat_hours)) != len(flat_hours):
+        raise ValueError("hour_bins hours must not contain duplicates")
+    if len(set(flat_months)) != len(flat_months):
+        raise ValueError("month_bins months must not contain duplicates")
+    if (set(flat_hours) != set(list(range(24)))) or (len(set(flat_hours)) != 24):
+        raise ValueError("Input hour_bins does not contain all hours of the day")
+    if (set(flat_months) != set(list(range(1, 13, 1)))) or (
+        len(set(flat_months)) != 12
+    ):
+        raise ValueError("Input month_bins does not contain all months of the year")
 
-    # check that length of hour-bin-labels matches that of hour-bins
-    if len(hour_bin_labels) != len(hour_bins):
-        raise ValueError(
-            "Hour bin labels must be the same length as the number of hour bins."
-        )
-    if len(month_bin_labels) != len(month_bins):
-        raise ValueError(
-            "Month bin labels must be the same length as the number of month bins."
-        )
+    # create index/column labels
+    if month_labels:
+        if len(month_labels) != len(month_bins):
+            raise ValueError("month_labels must be the same length as month_bins")
+        col_labels = month_labels
+    else:
+        col_labels = []
+        for months in month_bins:
+            if len(months) == 1:
+                col_labels.append(calendar.month_abbr[months[0]])
+            else:
+                col_labels.append(
+                    f"{calendar.month_abbr[months[0]]} to {calendar.month_abbr[months[-1]]}"
+                )
+    if hour_labels:
+        if len(hour_labels) != len(hour_bins):
+            raise ValueError("hour_labels must be the same length as hour_bins")
+        row_labels = hour_labels
+    else:
+        row_labels = [f"{i[0]:02d}:00 ≤ t < {i[-1] + 1:02d}:00" for i in hour_bins]
 
-    # check that hour and month bins are valid
-    if len(set([item for sublist in hour_bins for item in sublist])) != 24:
-        raise ValueError("Hour bins must contain all hours [0-23]")
-    if len(set([item for sublist in month_bins for item in sublist])) != 12:
-        raise ValueError("Month bins must contain all months [1-12]")
-
-    # convert series to dataframe with month/hour columns, and aggregate
-    df = series.to_frame()
-    df["hour"] = series.index.hour
-    df["month"] = series.index.month
-    a = []
+    # create indexing bins
+    values = []
     for months in month_bins:
-        b = []
+        month_mask = series.index.month.isin(months)
+        inner_values = []
         for hours in hour_bins:
-            b.append(
-                df[df.month.isin(months) & df.hour.isin(hours)][series.name].agg(agg)
-            )
-        a.append(b)
-    df = pd.DataFrame(a, index=month_bin_labels, columns=hour_bin_labels).T
+            mask = series.index.hour.isin(hours) & month_mask
+            aggregated = series.loc[mask].agg(agg)
+            inner_values.append(aggregated)
+        values.append(inner_values)
+    df = pd.DataFrame(values, index=col_labels, columns=row_labels).T
 
     return df
 
diff --git a/LadybugTools_Engine/Python/tests/test_helpers.py b/LadybugTools_Engine/Python/tests/test_helpers.py
index b4ce2f35..17ed108e 100644
--- a/LadybugTools_Engine/Python/tests/test_helpers.py
+++ b/LadybugTools_Engine/Python/tests/test_helpers.py
@@ -13,10 +13,11 @@
     decay_rate_smoother,
     default_analysis_periods,
     default_combined_analysis_periods,
+    default_hour_analysis_periods,
     default_month_analysis_periods,
-    default_time_analysis_periods,
     evaporative_cooling_effect,
     evaporative_cooling_effect_collection,
+    month_hour_binned_series,
     proximity_decay,
     radiation_at_height,
     remove_leap_days,
@@ -24,7 +25,6 @@
     sanitise_string,
     target_wind_speed_collection,
     temperature_at_height,
-    time_binned_dataframe,
     timedelta_tostring,
     weibull_pdf,
     wind_direction_average,
@@ -223,9 +223,9 @@ def test_rolling_window():
     assert rolling_window(array, window).tolist() == expected_output
 
 
-def test_default_time_analysis_periods():
+def test_default_hour_analysis_periods():
     """_"""
-    aps = default_time_analysis_periods()
+    aps = default_hour_analysis_periods()
     assert len(aps) == 4
     assert isinstance(aps[0], AnalysisPeriod)
     assert isinstance(aps[1], AnalysisPeriod)
@@ -408,30 +408,27 @@ def test_remove_leap_days():
     assert len(s) == 728
 
 
-def test_time_binned_dataframe():
+def test_month_hour_binned_series():
     """_"""
     s = pd.Series(
         index=pd.date_range(start="2017-01-01 00:00:00", freq="60T", periods=8760),
         data=range(8760),
     )
 
-    # Test defaults
-    assert time_binned_dataframe(s).shape == (24, 12)
-
     # test that the function returns a dataframe
-    assert isinstance(time_binned_dataframe(s), pd.DataFrame)
+    assert isinstance(month_hour_binned_series(s), pd.DataFrame)
 
     # test that the function raises an error if the series is not a time series
     with pytest.raises(ValueError):
-        time_binned_dataframe([1, 2, 3])
+        month_hour_binned_series([1, 2, 3])
 
     # test that the function raises an error if the series is empty
     with pytest.raises(ValueError):
-        time_binned_dataframe(pd.Series(dtype=float))
+        month_hour_binned_series(pd.Series(dtype=float))
 
     # test that the function raises an error if the series does not contain at least 12 months of data
     with pytest.raises(ValueError):
-        time_binned_dataframe(
+        month_hour_binned_series(
             pd.Series(
                 index=pd.date_range(
                     start="2017-01-01 00:00:00", freq="60T", periods=5000
@@ -442,7 +439,7 @@ def test_time_binned_dataframe():
 
     # test that the function raises an error if the series does not have at least 24 values per day
     with pytest.raises(ValueError):
-        time_binned_dataframe(
+        month_hour_binned_series(
             pd.Series(
                 index=pd.date_range(
                     start="2017-01-01 00:00:00", freq="120T", periods=8760 * 3
@@ -453,26 +450,38 @@ def test_time_binned_dataframe():
 
     # test that the function raises an error if the length of hour-bin-labels does not match that of hour-bins
     with pytest.raises(ValueError):
-        time_binned_dataframe(s, hour_bin_labels=["Morning", "Afternoon"])
+        month_hour_binned_series(s, hour_labels=["Morning", "Afternoon"])
 
     # test that the function raises an error if the length of month-bin-labels does not match that of month-bins
     with pytest.raises(ValueError):
-        time_binned_dataframe(s, month_bin_labels=["Q1", "Q2", "Q3"])
+        month_hour_binned_series(s, month_labels=["Q1", "Q2", "Q3"])
 
     # test that the function raises an error if hour bins do not contain all hours [0-23]
     with pytest.raises(ValueError):
-        time_binned_dataframe(s, hour_bins=[[0, 1, 2], [3, 4, 5]])
+        month_hour_binned_series(s, hour_bins=[[0, 1, 2], [3, 4, 5]])
 
    # test that the function raises an error if month bins do not contain all months [1-12]
     with pytest.raises(ValueError):
-        time_binned_dataframe(s, month_bins=[[1, 2, 3], [4, 5, 6]])
+        month_hour_binned_series(s, month_bins=[[1, 2, 3], [4, 5, 6]])
 
-    # test that the function returns a dataframe with the expected shape
-    df = time_binned_dataframe(s)
-    assert df.shape == (24, 12)
+    # test that the function raises an error if month bins overlap
+    with pytest.raises(ValueError):
+        month_hour_binned_series(
+            s, month_bins=[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [10, 11, 12]]
+        )
 
-    # Test with custom bins
-    assert time_binned_dataframe(
+    # test that the function raises an error if hour bins overlap
+    with pytest.raises(ValueError):
+        month_hour_binned_series(
+            s,
+            hour_bins=[
+                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
+                [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
+            ],
+        )
+
+    # test that the function returns a dataframe with the expected shape
+    assert month_hour_binned_series(
         s,
         month_bins=[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
         hour_bins=[
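The wrap-around bin expansion that the patch performs when deriving bins from the default analysis periods can be exercised in isolation. The sketch below mirrors the np.roll logic added to month_hour_binned_series; the helper name hours_from_period is hypothetical and exists only for illustration, and months follow the same pattern with an additional + 1 offset because they are 1-based.

    import numpy as np

    def hours_from_period(st_hour: int, end_hour: int) -> tuple:
        """Expand a start/end hour pair into an explicit, possibly midnight-wrapping,
        tuple of hours, mirroring the np.roll trick in month_hour_binned_series."""
        hours = np.arange(0, 24)
        if st_hour == end_hour:
            return (st_hour,)
        length = end_hour - st_hour  # negative when the period wraps past midnight
        # cast to int so the printed tuples read cleanly
        return tuple(int(h) for h in np.roll(hours, -st_hour)[: length + 1])

    print(hours_from_period(5, 11))   # (5, 6, 7, 8, 9, 10, 11)
    print(hours_from_period(21, 4))   # (21, 22, 23, 0, 1, 2, 3, 4)

Provided the default periods tile the day (and year) without gaps or overlap, bins produced this way pass the duplicate and coverage checks introduced in helpers.py, which is what the new overlap tests above guard against for user-supplied bins.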