From 5a1635632d908a5df7cc4791b7ea07f3f3fd5950 Mon Sep 17 00:00:00 2001 From: Rantaharju Jarno Date: Mon, 21 Oct 2024 09:28:50 +0300 Subject: [PATCH] Add a utility for selecting schema columns and feature columns --- niimpy/preprocessing/application.py | 6 +++++- niimpy/preprocessing/audio.py | 26 +++++++++++++------------- niimpy/preprocessing/battery.py | 8 ++++++++ niimpy/preprocessing/communication.py | 10 ++++++++++ niimpy/preprocessing/location.py | 3 +++ niimpy/preprocessing/screen.py | 9 +++++++++ niimpy/preprocessing/tracker.py | 8 +++++--- niimpy/preprocessing/util.py | 6 ++++++ 8 files changed, 59 insertions(+), 17 deletions(-) diff --git a/niimpy/preprocessing/application.py b/niimpy/preprocessing/application.py index 804c5516..bc6b7022 100644 --- a/niimpy/preprocessing/application.py +++ b/niimpy/preprocessing/application.py @@ -392,7 +392,9 @@ def app_count(df, bat=None, screen=None, config=None): result = util.group_data(df2, "app_group")["app_group"].resample(**config["resample_args"], include_groups=False).count() result = pd.DataFrame(result).rename(columns={"app_group": "count"}) result = util.reset_groups(result, "app_group") + result = util.select_columns(result, ["app_group", "count"]) return result + return None @@ -505,7 +507,9 @@ def resample_group(group): df2.set_index("datetime", inplace=True) result = util.group_data(df2, "app_group")["duration"].resample(**config["resample_args"], include_groups=False).sum() result = pd.DataFrame(result).rename(columns={"app_group": "count"}) - return util.reset_groups(result, "app_group") + df2 = util.reset_groups(result, "app_group") + df2 = util.select_columns(df2, ["app_group", "duration"]) + return df2 return None diff --git a/niimpy/preprocessing/audio.py b/niimpy/preprocessing/audio.py index a4209593..a0518f3f 100755 --- a/niimpy/preprocessing/audio.py +++ b/niimpy/preprocessing/audio.py @@ -42,7 +42,7 @@ def audio_count_silent(df_u, config=None): result = 
util.group_data(df_u)[col_name].resample(**config["resample_args"]).sum() result = result.to_frame(name='audio_count_silent') result = util.reset_groups(result) - result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_count_silent"]) return result return None @@ -87,7 +87,7 @@ def audio_count_speech(df_u, config=None): result = util.group_data(df_s)[col_name].resample(**config["resample_args"]).sum() result = result.to_frame(name='audio_count_speech') result = util.reset_groups(result) - result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_count_speech"]) return result return None @@ -129,7 +129,7 @@ def audio_count_loud(df_u, config=None): result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).count() result = result.to_frame(name='audio_count_loud') result = util.reset_groups(result) - result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_count_loud"]) return result return None @@ -167,7 +167,7 @@ def audio_min_freq(df_u, config=None): result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).min() result = result.to_frame(name='audio_min_freq') result = util.reset_groups(result) - result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_min_freq"]) return result return None @@ -205,7 +205,7 @@ def audio_max_freq(df_u, config=None): result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).max() result = result.to_frame(name='audio_max_freq') result = util.reset_groups(result) - result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_max_freq"]) return result return None @@ -243,7 +243,7 @@ def audio_mean_freq(df_u, config=None): result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).mean() result = result.to_frame(name='audio_mean_freq') result = util.reset_groups(result) - 
result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_mean_freq"]) return result return None @@ -281,7 +281,7 @@ def audio_median_freq(df_u, config=None): result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).median() result = result.to_frame(name='audio_median_freq') result = util.reset_groups(result) - result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_median_freq"]) return result return None @@ -319,7 +319,7 @@ def audio_std_freq(df_u, config=None): result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).std() result = result.to_frame(name='audio_std_freq') result = util.reset_groups(result) - result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_std_freq"]) return result return None @@ -357,7 +357,7 @@ def audio_min_db(df_u, config=None): result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).min() result = result.to_frame(name='audio_min_db') result = util.reset_groups(result) - result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_min_db"]) return result return None @@ -395,7 +395,7 @@ def audio_max_db(df_u, config=None): result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).max() result = result.to_frame(name='audio_max_db') result = util.reset_groups(result) - result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_max_db"]) return result return None @@ -433,7 +433,7 @@ def audio_mean_db(df_u, config=None): result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).mean() result = result.to_frame(name='audio_mean_db') result = util.reset_groups(result) - result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_mean_db"]) return result return None @@ -471,7 +471,7 @@ def audio_median_db(df_u, config): result = 
util.group_data(df_u)[col_name].resample(**config["resample_args"]).median() result = result.to_frame(name='audio_median_db') result = util.reset_groups(result) - result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_median_db"]) return result return None @@ -509,7 +509,7 @@ def audio_std_db(df_u, config=None): result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).std() result = result.to_frame(name='audio_std_db') result = util.reset_groups(result) - result.index.rename("datetime", inplace=True) + result = util.select_columns(result, ["audio_std_db"]) return result return None diff --git a/niimpy/preprocessing/battery.py b/niimpy/preprocessing/battery.py index 904bc596..14874913 100644 --- a/niimpy/preprocessing/battery.py +++ b/niimpy/preprocessing/battery.py @@ -62,6 +62,7 @@ def battery_mean_level(df, config=None): result = util.group_data(df)[col_name].resample(**config["resample_args"]).mean() result = result.to_frame(name='battery_mean_level') result = util.reset_groups(result) + result = util.select_columns(result, ["battery_mean_level"]) return result @@ -93,6 +94,7 @@ def battery_median_level(df, config=None): result = util.group_data(df)[col_name].resample(**config["resample_args"]).median() result = result.to_frame(name='battery_median_level') result = util.reset_groups(result) + result = util.select_columns(result, ["battery_median_level"]) return result @@ -124,6 +126,7 @@ def battery_std_level(df, config=None): result = util.group_data(df)[col_name].resample(**config["resample_args"]).std() result = result.to_frame(name='battery_std_level') result = util.reset_groups(result) + result = util.select_columns(result, ["battery_std_level"]) return result @@ -169,6 +172,7 @@ def calculate_shutdown(df): result = util.group_data(df).apply(calculate_shutdown) result = util.reset_groups(result) + result = util.select_columns(result, ["shutdown_time"]) return result @@ -212,6 +216,7 @@ def 
calculate_discharge(df): result = util.group_data(df).apply(calculate_discharge) result = util.reset_groups(result) + result = util.select_columns(result, ["battery_discharge"]) return result @@ -268,6 +273,7 @@ def count_alive(series): occurrences = occurrences.to_frame(name='occurrences') occurrences = util.reset_groups(occurrences) + occurrences = util.select_columns(occurrences, ["occurrences"]) return occurrences @@ -309,6 +315,7 @@ def calculate_gaps(df): result = util.group_data(df).apply(calculate_gaps, include_groups=False) result = util.reset_groups(result) + result = util.select_columns(result, ["battery_gap"]) return result @@ -344,6 +351,7 @@ def calculate_discharge(df): discharge = util.group_data(df).apply(calculate_discharge, include_groups=False) discharge = util.reset_groups(discharge) + discharge = util.select_columns(discharge, ["bdelta", "charge/discharge"]) return discharge diff --git a/niimpy/preprocessing/communication.py b/niimpy/preprocessing/communication.py index 804e8108..24e91339 100644 --- a/niimpy/preprocessing/communication.py +++ b/niimpy/preprocessing/communication.py @@ -107,6 +107,7 @@ def call_duration_total(df, config=None): result = pd.concat([outgoing, incoming, missed], axis=1) result.fillna(0, inplace=True) result = util.reset_groups(result) + result = util.select_columns(result, ["outgoing_duration_total", "incoming_duration_total", "missed_duration_total"]) return result @@ -159,6 +160,7 @@ def call_duration_mean(df, config=None): result = pd.concat([outgoing, incoming, missed], axis=1) result.fillna(0, inplace=True) result = util.reset_groups(result) + result = util.select_columns(result, ["outgoing_duration_mean", "incoming_duration_mean", "missed_duration_mean"]) return result @@ -213,6 +215,7 @@ def call_duration_median(df, config=None): result = pd.concat([outgoing, incoming, missed], axis=1) result.fillna(0, inplace=True) result = util.reset_groups(result) + result = util.select_columns(result, 
["outgoing_duration_median", "incoming_duration_median", "missed_duration_median"]) return result @@ -266,6 +269,7 @@ def call_duration_std(df, config=None): result = pd.concat([outgoing, incoming, missed], axis=1) result.fillna(0, inplace=True) result = util.reset_groups(result) + result = util.select_columns(result, ["outgoing_duration_std", "incoming_duration_std", "missed_duration_std"]) return result @@ -316,6 +320,7 @@ def call_count(df, config=None): result = pd.concat([outgoing, incoming, missed], axis=1) result.fillna(0, inplace=True) result = util.reset_groups(result) + result = util.select_columns(result, ["outgoing_count", "incoming_count", "missed_count"]) return result @@ -363,6 +368,7 @@ def call_outgoing_incoming_ratio(df, config=None): df2.fillna(0, inplace=True) result = df2.to_frame(name='outgoing_incoming_ratio') result = util.reset_groups(result) + result = util.select_columns(result, ["outgoing_incoming_ratio"]) return result @@ -406,6 +412,7 @@ def call_distribution(df, config=None): include_groups=False ) df = util.reset_groups(df) + df = util.select_columns(df, ["distribution"]) return df @@ -455,6 +462,7 @@ def message_count(df, config=None): result = pd.concat([outgoing, incoming], axis=1) result.fillna(0, inplace=True) result = util.reset_groups(result) + result = util.select_columns(result, ["outgoing_count", "incoming_count"]) return result return pd.DataFrame() @@ -503,6 +511,7 @@ def message_outgoing_incoming_ratio(df, config=None): df2.fillna(0, inplace=True) result = df2.to_frame(name='outgoing_incoming_ratio') result = util.reset_groups(result) + result = util.select_columns(result, ["outgoing_incoming_ratio"]) return result @@ -550,6 +559,7 @@ def message_distribution(df, config=None): include_groups=False ) df = util.reset_groups(df) + df = util.select_columns(df, ["distribution"]) return df diff --git a/niimpy/preprocessing/location.py b/niimpy/preprocessing/location.py index cf33d2ac..2ec7249f 100644 --- 
a/niimpy/preprocessing/location.py +++ b/niimpy/preprocessing/location.py @@ -290,6 +290,7 @@ def compute_features(df): result = util.group_data(df).resample(**config["resample_args"], include_groups=False).apply(compute_features) result = util.reset_groups(result) + result = util.select_columns(result, ["n_significant_places"]) return result @@ -422,6 +423,7 @@ def compute_features(df): result = util.group_data(df).resample(**config["resample_args"], include_groups=False).apply(compute_features) result = util.reset_groups(result) + result = util.select_columns(result, ["n_sps", "n_static", "n_moving", "n_rare", "n_home", "max_dist_home", "n_transitions", "n_top1", "n_top2", "n_top3", "n_top4", "n_top5", "entropy", "normalized_entropy"]) return result @@ -488,6 +490,7 @@ def compute_features(df): result = util.group_data(df).resample(**config["resample_args"], include_groups=False).apply(compute_features) result = util.reset_groups(result) + result = util.select_columns(result, ["dist_total", "n_bins", "speed_average", "speed_variance", "speed_max", "variance", "log_variance"]) return result ALL_FEATURES = [globals()[name] for name in globals() diff --git a/niimpy/preprocessing/screen.py b/niimpy/preprocessing/screen.py index c8f6f758..8b94e506 100755 --- a/niimpy/preprocessing/screen.py +++ b/niimpy/preprocessing/screen.py @@ -206,6 +206,7 @@ def screen_off(df, bat=None, config=None): df = df[id_columns + ["screen_status"]] df.rename(columns={"screen_status":"screen_off"}, inplace=True) df = util.reset_groups(df) + df = util.select_columns(df, ["screen_off"]) return df @@ -254,6 +255,7 @@ def screen_count(df, bat=None, config=None): use = use.to_frame(name='screen_use_count') result = pd.concat([on, off, use], axis=1) result = util.reset_groups(result) + result = util.select_columns(result, ["screen_on_count", "screen_off_count", "screen_use_count"]) return result @@ -303,6 +305,7 @@ def screen_duration(df, bat=None, config=None): use = 
use.to_frame(name='screen_use_durationtotal') result = pd.concat([on, off, use], axis=1) result = util.reset_groups(result) + result = util.select_columns(result, ["screen_on_durationtotal", "screen_off_durationtotal", "screen_use_durationtotal"]) return result @@ -352,6 +355,7 @@ def screen_duration_min(df, bat=None, config=None): use = use.to_frame(name='screen_use_durationminimum') result = pd.concat([on, off, use], axis=1) result = util.reset_groups(result) + result = util.select_columns(result, ["screen_on_durationminimum", "screen_off_durationminimum", "screen_use_durationminimum"]) return result @@ -401,6 +405,7 @@ def screen_duration_max(df, bat=None, config=None): use = use.to_frame(name='screen_use_durationmaximum') result = pd.concat([on, off, use], axis=1) result = util.reset_groups(result) + result = util.select_columns(result, ["screen_on_durationmaximum", "screen_off_durationmaximum", "screen_use_durationmaximum"]) return result @@ -451,6 +456,7 @@ def screen_duration_mean(df, bat=None, config=None): use = use.to_frame(name='screen_use_durationmean') result = pd.concat([on, off, use], axis=1) result = util.reset_groups(result) + result = util.select_columns(result, ["screen_on_durationmean", "screen_off_durationmean", "screen_use_durationmean"]) return result @@ -500,6 +506,7 @@ def screen_duration_median(df, bat=None, config=None): use = use.to_frame(name='screen_use_durationmedian') result = pd.concat([on, off, use], axis=1) result = util.reset_groups(result) + result = util.select_columns(result, ["screen_on_durationmedian", "screen_off_durationmedian", "screen_use_durationmedian"]) return result @@ -549,6 +556,7 @@ def screen_duration_std(df, bat=None, config=None): use = use.to_frame(name='screen_use_durationstd') result = pd.concat([on, off, use], axis=1) result = util.reset_groups(result) + result = util.select_columns(result, ["screen_on_durationstd", "screen_off_durationstd", "screen_use_durationstd"]) return result @@ -588,6 +596,7 @@ def 
screen_first_unlock(df, bat=None, config=None): result = util.group_data(df2[df2.on==1])["time"].resample(rule='1D').min() result = result.to_frame(name="first_unlock") result = util.reset_groups(result) + result = util.select_columns(result, ["first_unlock"]) return result diff --git a/niimpy/preprocessing/tracker.py b/niimpy/preprocessing/tracker.py index 5063a59d..7b427c7a 100644 --- a/niimpy/preprocessing/tracker.py +++ b/niimpy/preprocessing/tracker.py @@ -73,6 +73,9 @@ def step_summary(df, config=None): summary_df['max_sum_step'] = util.group_data(df)['daily_sum'].max() summary_df = util.reset_groups(summary_df) + summary_df = util.select_columns(summary_df, + ["median_sum_step", "avg_sum_step", "std_sum_step", "min_sum_step", "max_sum_step"] + ) return summary_df @@ -135,9 +138,8 @@ def tracker_step_distribution(steps_df, config=None): # Divide hourly steps by daily sum to get the distribution steps['step_distribution'] = steps[steps_column] / steps['step_sum'] - # Set timestamp index - df = df.set_index("time") - + # Select identifier and feature columns + steps = util.select_columns(steps, ["step_distribution", "step_sum"]) return steps[["step_distribution", "step_sum"]] diff --git a/niimpy/preprocessing/util.py b/niimpy/preprocessing/util.py index 71f9c239..dbfc3983 100644 --- a/niimpy/preprocessing/util.py +++ b/niimpy/preprocessing/util.py @@ -185,6 +185,12 @@ def identifier_columns(df, id_columns = ["user", "device", "group"]): return columns +def select_columns(df, columns, id_columns = ["user", "device", "group"]): + """ Return df restricted to the columns that identifier_columns resolves from id_columns + columns (identifier columns first, then the listed feature columns) """ + columns = identifier_columns(df, id_columns + columns) + return df[columns] + + def group_data(df, additional_columns=None, id_columns=["user", "device", "group"]): """ Group the dataframe by Niimpy standard user identifier columns present in the dataframe. The columns are 'user', 'device', and 'group'. An addional