Skip to content

Commit

Permalink
Add a utility for selecting schema columns and feature columns
Browse files: browse the repository at this point in the history
  • Loading branch information
rantahar committed Oct 21, 2024
1 parent 1ed8a94 commit 5a16356
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 17 deletions.
6 changes: 5 additions & 1 deletion niimpy/preprocessing/application.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -392,7 +392,9 @@ def app_count(df, bat=None, screen=None, config=None):
result = util.group_data(df2, "app_group")["app_group"].resample(**config["resample_args"], include_groups=False).count()
result = pd.DataFrame(result).rename(columns={"app_group": "count"})
result = util.reset_groups(result, "app_group")
result = util.select_columns(result, ["app_group", "count"])
return result

return None


Expand Down Expand Up @@ -505,7 +507,9 @@ def resample_group(group):
df2.set_index("datetime", inplace=True)
result = util.group_data(df2, "app_group")["duration"].resample(**config["resample_args"], include_groups=False).sum()
result = pd.DataFrame(result).rename(columns={"app_group": "count"})
return util.reset_groups(result, "app_group")
df2 = util.reset_groups(result, "app_group")
df2 = util.select_columns(df2, ["app_group", "duration"])
return df2

return None

Expand Down
26 changes: 13 additions & 13 deletions niimpy/preprocessing/audio.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -42,7 +42,7 @@ def audio_count_silent(df_u, config=None):
result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).sum()
result = result.to_frame(name='audio_count_silent')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_count_silent"])
return result
return None

Expand Down Expand Up @@ -87,7 +87,7 @@ def audio_count_speech(df_u, config=None):
result = util.group_data(df_s)[col_name].resample(**config["resample_args"]).sum()
result = result.to_frame(name='audio_count_speech')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_count_speech"])
return result
return None

Expand Down Expand Up @@ -129,7 +129,7 @@ def audio_count_loud(df_u, config=None):
result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).count()
result = result.to_frame(name='audio_count_loud')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_count_loud"])
return result
return None

Expand Down Expand Up @@ -167,7 +167,7 @@ def audio_min_freq(df_u, config=None):
result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).min()
result = result.to_frame(name='audio_min_freq')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_min_freq"])
return result
return None

Expand Down Expand Up @@ -205,7 +205,7 @@ def audio_max_freq(df_u, config=None):
result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).max()
result = result.to_frame(name='audio_max_freq')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_max_freq"])
return result
return None

Expand Down Expand Up @@ -243,7 +243,7 @@ def audio_mean_freq(df_u, config=None):
result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).mean()
result = result.to_frame(name='audio_mean_freq')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_mean_freq"])
return result
return None

Expand Down Expand Up @@ -281,7 +281,7 @@ def audio_median_freq(df_u, config=None):
result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).median()
result = result.to_frame(name='audio_median_freq')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_median_freq"])
return result
return None

Expand Down Expand Up @@ -319,7 +319,7 @@ def audio_std_freq(df_u, config=None):
result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).std()
result = result.to_frame(name='audio_std_freq')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_std_freq"])
return result
return None

Expand Down Expand Up @@ -357,7 +357,7 @@ def audio_min_db(df_u, config=None):
result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).min()
result = result.to_frame(name='audio_min_db')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_min_db"])
return result
return None

Expand Down Expand Up @@ -395,7 +395,7 @@ def audio_max_db(df_u, config=None):
result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).max()
result = result.to_frame(name='audio_max_db')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_max_db"])
return result
return None

Expand Down Expand Up @@ -433,7 +433,7 @@ def audio_mean_db(df_u, config=None):
result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).mean()
result = result.to_frame(name='audio_mean_db')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_mean_db"])
return result
return None

Expand Down Expand Up @@ -471,7 +471,7 @@ def audio_median_db(df_u, config):
result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).median()
result = result.to_frame(name='audio_median_db')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_median_db"])
return result
return None

Expand Down Expand Up @@ -509,7 +509,7 @@ def audio_std_db(df_u, config=None):
result = util.group_data(df_u)[col_name].resample(**config["resample_args"]).std()
result = result.to_frame(name='audio_std_db')
result = util.reset_groups(result)
result.index.rename("datetime", inplace=True)
result = util.select_columns(result, ["audio_std_db"])
return result
return None

Expand Down
8 changes: 8 additions & 0 deletions niimpy/preprocessing/battery.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -62,6 +62,7 @@ def battery_mean_level(df, config=None):
result = util.group_data(df)[col_name].resample(**config["resample_args"]).mean()
result = result.to_frame(name='battery_mean_level')
result = util.reset_groups(result)
result = util.select_columns(result, ["battery_mean_level"])
return result


Expand Down Expand Up @@ -93,6 +94,7 @@ def battery_median_level(df, config=None):
result = util.group_data(df)[col_name].resample(**config["resample_args"]).median()
result = result.to_frame(name='battery_median_level')
result = util.reset_groups(result)
result = util.select_columns(result, ["battery_median_level"])
return result


Expand Down Expand Up @@ -124,6 +126,7 @@ def battery_std_level(df, config=None):
result = util.group_data(df)[col_name].resample(**config["resample_args"]).std()
result = result.to_frame(name='battery_std_level')
result = util.reset_groups(result)
result = util.select_columns(result, ["battery_std_level"])
return result


Expand Down Expand Up @@ -169,6 +172,7 @@ def calculate_shutdown(df):

result = util.group_data(df).apply(calculate_shutdown)
result = util.reset_groups(result)
result = util.select_columns(result, ["shutdown_time"])
return result


Expand Down Expand Up @@ -212,6 +216,7 @@ def calculate_discharge(df):

result = util.group_data(df).apply(calculate_discharge)
result = util.reset_groups(result)
result = util.select_columns(result, ["battery_discharge"])
return result


Expand Down Expand Up @@ -268,6 +273,7 @@ def count_alive(series):
occurrences = occurrences.to_frame(name='occurrences')

occurrences = util.reset_groups(occurrences)
occurrences = util.select_columns(occurrences, ["occurrences"])
return occurrences


Expand Down Expand Up @@ -309,6 +315,7 @@ def calculate_gaps(df):

result = util.group_data(df).apply(calculate_gaps, include_groups=False)
result = util.reset_groups(result)
result = util.select_columns(result, ["battery_gap"])
return result


Expand Down Expand Up @@ -344,6 +351,7 @@ def calculate_discharge(df):

discharge = util.group_data(df).apply(calculate_discharge, include_groups=False)
discharge = util.reset_groups(discharge)
discharge = util.select_columns(discharge, ["bdelta", "charge/discharge"])
return discharge


Expand Down
10 changes: 10 additions & 0 deletions niimpy/preprocessing/communication.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -107,6 +107,7 @@ def call_duration_total(df, config=None):
result = pd.concat([outgoing, incoming, missed], axis=1)
result.fillna(0, inplace=True)
result = util.reset_groups(result)
result = util.select_columns(result, ["outgoing_duration_total", "incoming_duration_total", "missed_duration_total"])
return result


Expand Down Expand Up @@ -159,6 +160,7 @@ def call_duration_mean(df, config=None):
result = pd.concat([outgoing, incoming, missed], axis=1)
result.fillna(0, inplace=True)
result = util.reset_groups(result)
result = util.select_columns(result, ["outgoing_duration_mean", "incoming_duration_mean", "missed_duration_mean"])
return result


Expand Down Expand Up @@ -213,6 +215,7 @@ def call_duration_median(df, config=None):
result = pd.concat([outgoing, incoming, missed], axis=1)
result.fillna(0, inplace=True)
result = util.reset_groups(result)
result = util.select_columns(result, ["outgoing_duration_median", "incoming_duration_median", "missed_duration_median"])
return result


Expand Down Expand Up @@ -266,6 +269,7 @@ def call_duration_std(df, config=None):
result = pd.concat([outgoing, incoming, missed], axis=1)
result.fillna(0, inplace=True)
result = util.reset_groups(result)
result = util.select_columns(result, ["outgoing_duration_std", "incoming_duration_std", "missed_duration_std"])
return result


Expand Down Expand Up @@ -316,6 +320,7 @@ def call_count(df, config=None):
result = pd.concat([outgoing, incoming, missed], axis=1)
result.fillna(0, inplace=True)
result = util.reset_groups(result)
result = util.select_columns(result, ["outgoing_count", "incoming_count", "missed_count"])
return result


Expand Down Expand Up @@ -363,6 +368,7 @@ def call_outgoing_incoming_ratio(df, config=None):
df2.fillna(0, inplace=True)
result = df2.to_frame(name='outgoing_incoming_ratio')
result = util.reset_groups(result)
result = util.select_columns(result, ["outgoing_incoming_ratio"])
return result


Expand Down Expand Up @@ -406,6 +412,7 @@ def call_distribution(df, config=None):
include_groups=False
)
df = util.reset_groups(df)
df = util.select_columns(df, ["distribution"])

return df

Expand Down Expand Up @@ -455,6 +462,7 @@ def message_count(df, config=None):
result = pd.concat([outgoing, incoming], axis=1)
result.fillna(0, inplace=True)
result = util.reset_groups(result)
result = util.select_columns(result, ["outgoing_count", "incoming_count"])
return result
return pd.DataFrame()

Expand Down Expand Up @@ -503,6 +511,7 @@ def message_outgoing_incoming_ratio(df, config=None):
df2.fillna(0, inplace=True)
result = df2.to_frame(name='outgoing_incoming_ratio')
result = util.reset_groups(result)
result = util.select_columns(result, ["outgoing_incoming_ratio"])

return result

Expand Down Expand Up @@ -550,6 +559,7 @@ def message_distribution(df, config=None):
include_groups=False
)
df = util.reset_groups(df)
df = util.select_columns(df, ["distribution"])
return df


Expand Down
3 changes: 3 additions & 0 deletions niimpy/preprocessing/location.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -290,6 +290,7 @@ def compute_features(df):

result = util.group_data(df).resample(**config["resample_args"], include_groups=False).apply(compute_features)
result = util.reset_groups(result)
result = util.select_columns(result, ["n_significant_places"])
return result


Expand Down Expand Up @@ -422,6 +423,7 @@ def compute_features(df):

result = util.group_data(df).resample(**config["resample_args"], include_groups=False).apply(compute_features)
result = util.reset_groups(result)
result = util.select_columns(result, ["n_sps", "n_static", "n_moving", "n_rare", "n_home", "max_dist_home", "n_transitions", "n_top1", "n_top2", "n_top3", "n_top4", "n_top5", "entropy", "normalized_entropy"])
return result


Expand Down Expand Up @@ -488,6 +490,7 @@ def compute_features(df):

result = util.group_data(df).resample(**config["resample_args"], include_groups=False).apply(compute_features)
result = util.reset_groups(result)
result = util.select_columns(result, ["dist_total", "n_bins", "speed_average", "speed_variance", "speed_max", "variance", "log_variance"])
return result

ALL_FEATURES = [globals()[name] for name in globals()
Expand Down
9 changes: 9 additions & 0 deletions niimpy/preprocessing/screen.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -206,6 +206,7 @@ def screen_off(df, bat=None, config=None):
df = df[id_columns + ["screen_status"]]
df.rename(columns={"screen_status":"screen_off"}, inplace=True)
df = util.reset_groups(df)
df = util.select_columns(df, ["screen_status"])
return df


Expand Down Expand Up @@ -254,6 +255,7 @@ def screen_count(df, bat=None, config=None):
use = use.to_frame(name='screen_use_count')
result = pd.concat([on, off, use], axis=1)
result = util.reset_groups(result)
result = util.select_columns(result, ["screen_on_count", "screen_off_count", "screen_use_count"])
return result


Expand Down Expand Up @@ -303,6 +305,7 @@ def screen_duration(df, bat=None, config=None):
use = use.to_frame(name='screen_use_durationtotal')
result = pd.concat([on, off, use], axis=1)
result = util.reset_groups(result)
result = util.select_columns(result, ["screen_on_durationtotal", "screen_off_durationtotal", "screen_use_durationtotal"])
return result


Expand Down Expand Up @@ -352,6 +355,7 @@ def screen_duration_min(df, bat=None, config=None):
use = use.to_frame(name='screen_use_durationminimum')
result = pd.concat([on, off, use], axis=1)
result = util.reset_groups(result)
result = util.select_columns(result, ["screen_on_durationminimum", "screen_off_durationminimum", "screen_use_durationminimum"])
return result


Expand Down Expand Up @@ -401,6 +405,7 @@ def screen_duration_max(df, bat=None, config=None):
use = use.to_frame(name='screen_use_durationmaximum')
result = pd.concat([on, off, use], axis=1)
result = util.reset_groups(result)
result = util.select_columns(result, ["screen_on_durationmaximum", "screen_off_durationmaximum", "screen_use_durationmaximum"])
return result


Expand Down Expand Up @@ -451,6 +456,7 @@ def screen_duration_mean(df, bat=None, config=None):
use = use.to_frame(name='screen_use_durationmean')
result = pd.concat([on, off, use], axis=1)
result = util.reset_groups(result)
result = util.select_columns(result, ["screen_on_durationmean", "screen_off_durationmean", "screen_use_durationmean"])
return result


Expand Down Expand Up @@ -500,6 +506,7 @@ def screen_duration_median(df, bat=None, config=None):
use = use.to_frame(name='screen_use_durationmedian')
result = pd.concat([on, off, use], axis=1)
result = util.reset_groups(result)
result = util.select_columns(result, ["screen_on_durationmedian", "screen_off_durationmedian", "screen_use_durationmedian"])
return result


Expand Down Expand Up @@ -549,6 +556,7 @@ def screen_duration_std(df, bat=None, config=None):
use = use.to_frame(name='screen_use_durationstd')
result = pd.concat([on, off, use], axis=1)
result = util.reset_groups(result)
result = util.select_columns(result, ["screen_on_durationstd", "screen_off_durationstd", "screen_use_durationstd"])
return result


Expand Down Expand Up @@ -588,6 +596,7 @@ def screen_first_unlock(df, bat=None, config=None):
result = util.group_data(df2[df2.on==1])["time"].resample(rule='1D').min()
result = result.to_frame(name="first_unlock")
result = util.reset_groups(result)
result = util.select_columns(result, ["first_unlock"])
return result


Expand Down
8 changes: 5 additions & 3 deletions niimpy/preprocessing/tracker.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -73,6 +73,9 @@ def step_summary(df, config=None):
summary_df['max_sum_step'] = util.group_data(df)['daily_sum'].max()

summary_df = util.reset_groups(summary_df)
summary_df = util.select_columns(summary_df,
["median_sum_step", "avg_sum_step", "std_sum_step", "min_sum_step", "max_sum_step"]
)
return summary_df


Expand Down Expand Up @@ -135,9 +138,8 @@ def tracker_step_distribution(steps_df, config=None):
# Divide hourly steps by daily sum to get the distribution
steps['step_distribution'] = steps[steps_column] / steps['step_sum']

# Set timestamp index
df = df.set_index("time")

# Set index and select columns
steps = util.select_columns(steps, ["step_distribution", "step_sum"])
return steps[["step_distribution", "step_sum"]]


Expand Down
Loading

0 comments on commit 5a16356

Please sign in to comment.