Skip to content

Commit

Permalink
Update datetime feature creation and make it more overfitting robust
Browse files Browse the repository at this point in the history
  • Loading branch information
thomasmeissnercrm committed Oct 24, 2024
1 parent 3e96a42 commit 73bae9b
Show file tree
Hide file tree
Showing 5 changed files with 256 additions and 136 deletions.
70 changes: 52 additions & 18 deletions bluecast/preprocessing/datetime_features.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,22 @@
"""
Module for extracting date parts from datetime columns.
Cyclic (sin/cos) transformations are created for periodic date parts such as month, week of year, day, day of week and hour.
"""

import logging
from typing import List, Optional, Union

import numpy as np
import pandas as pd


def date_converter(
    df: pd.DataFrame,
    date_columns: Optional[List[Union[str, int, float]]],
    date_parts: Optional[List[str]],
) -> pd.DataFrame:
    """
    Extract date parts from datetime columns and add them as float columns.

    For periodic date parts (month, week_of_year, day, dayofweek, hour) cyclic
    sin/cos encodings are added as well. A date part is only materialised when
    it has more than one unique value in the column; constant parts carry no
    signal and would only invite overfitting. The original datetime columns are
    dropped from the returned frame.

    :param df: Dataframe to be processed.
    :param date_columns: List of datetime columns. If falsy, ``df`` is returned
        unchanged.
    :param date_parts: List of date parts to be extracted. Defaults to
        ["year", "week_of_year", "month", "day", "dayofweek", "hour"] when falsy.
    :return: Dataframe with datetime columns replaced by the extracted features.
    """
    if not date_columns:
        return df

    if not date_parts:
        date_parts = ["year", "week_of_year", "month", "day", "dayofweek", "hour"]

    # Periods used for the cyclic sin/cos encodings.
    date_part_periods = {
        "month": 12,
        "week_of_year": 52,
        "day": 31,
        "dayofweek": 7,
        "hour": 24,
    }

    for c in date_columns:
        for date_part in date_parts:
            # Extract the raw values for this part. week_of_year needs the
            # isocalendar() accessor; the remaining parts are plain .dt attributes.
            if date_part == "year":
                date_part_values = df[c].dt.year.astype(float)
            elif date_part == "week_of_year":
                date_part_values = df[c].dt.isocalendar().week.astype(float)
            elif date_part in ("month", "day", "dayofweek", "hour"):
                date_part_values = getattr(df[c].dt, date_part).astype(float)
            else:
                continue  # unknown date part requested; ignore it

            # Skip constant parts. Recomputing the values per part (instead of
            # reusing a shared variable) also prevents stale values from a
            # previous part leaking into the cyclic features of a later one.
            if date_part_values.nunique() <= 1:
                continue

            df[str(c) + "_" + date_part] = date_part_values

            # For date parts with a defined period, create cyclic features.
            if date_part in date_part_periods:
                period = date_part_periods[date_part]
                df[str(c) + "_" + date_part + "_sin"] = np.sin(
                    2 * np.pi * date_part_values / period
                )
                df[str(c) + "_" + date_part + "_cos"] = np.cos(
                    2 * np.pi * date_part_values / period
                )
        # Drop the original date column
        df = df.drop(c, axis=1)
    return df
119 changes: 95 additions & 24 deletions bluecast/tests/test_datetime_features.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import numpy as np
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal

from bluecast.preprocessing.datetime_features import date_converter

Expand All @@ -16,47 +18,116 @@ def sample_dataframe():


def test_date_converter(sample_dataframe):
    """date_converter adds raw and cyclic features only for non-constant date parts.

    Builds the expected frame from the same nunique()>1 rule the implementation
    uses, so the test stays valid regardless of which parts the fixture varies.
    """
    # Capture the datetime column before calling, since date_converter mutates
    # the input frame in place.
    date_col = sample_dataframe["datetime_col"]

    result = date_converter(
        sample_dataframe,
        ["datetime_col"],
        date_parts=["year", "week_of_year", "month", "day", "dayofweek", "hour"],
    )

    # Raw values for every date part, mirroring the implementation.
    part_values = {
        "year": date_col.dt.year.astype(float),
        "week_of_year": date_col.dt.isocalendar().week.astype(float),
        "month": date_col.dt.month.astype(float),
        "day": date_col.dt.day.astype(float),
        "dayofweek": date_col.dt.dayofweek.astype(float),
        "hour": date_col.dt.hour.astype(float),
    }
    # Periods for cyclic encodings (year has none).
    periods = {"month": 12, "week_of_year": 52, "day": 31, "dayofweek": 7, "hour": 24}

    expected_data = {"other_col": [1, 2]}
    # Only parts with more than one unique value get columns; periodic parts
    # additionally get sin/cos encodings.
    for part, values in part_values.items():
        if values.nunique() > 1:
            expected_data[f"datetime_col_{part}"] = values
            if part in periods:
                expected_data[f"datetime_col_{part}_sin"] = np.sin(
                    2 * np.pi * values / periods[part]
                )
                expected_data[f"datetime_col_{part}_cos"] = np.cos(
                    2 * np.pi * values / periods[part]
                )

    expected_result = pd.DataFrame(expected_data)

    # Reorder columns to match the result DataFrame.
    expected_result = expected_result[result.columns]

    assert_frame_equal(
        result.reset_index(drop=True),
        expected_result.reset_index(drop=True),
        check_like=True,
        check_column_type=False,
        check_dtype=False,
    )

    # Test with date_parts=None (which defaults to all parts); the expected
    # result is identical.
    result_with_default_parts = date_converter(
        sample_dataframe,
        ["datetime_col"],
        date_parts=None,
    )

    assert_frame_equal(
        result_with_default_parts.reset_index(drop=True),
        expected_result.reset_index(drop=True),
        check_like=True,
        check_dtype=False,
    )


def test_date_converter_single_unique_value():
    """A date part with a single unique value must not produce any feature columns."""
    # Both timestamps share the same month, so "month" is constant.
    frame = pd.DataFrame(
        {
            "datetime_col": pd.to_datetime(
                ["2021-01-01 10:30:00", "2021-01-15 15:45:00"]
            ),
            "other_col": [1, 2],
        }
    )

    converted = date_converter(frame, ["datetime_col"], date_parts=["month"])

    # Only the untouched column should remain: no raw month, no sin/cos.
    expected = pd.DataFrame({"other_col": [1, 2]})

    assert_frame_equal(
        converted.reset_index(drop=True),
        expected.reset_index(drop=True),
        check_like=True,
        check_column_type=False,
        check_dtype=False,
    )
Binary file modified dist/bluecast-1.6.3-py3-none-any.whl
Binary file not shown.
Binary file modified dist/bluecast-1.6.3.tar.gz
Binary file not shown.
Loading

0 comments on commit 73bae9b

Please sign in to comment.