Skip to content

Commit

Permalink
Move direct references to niimpy schema columns into util
Browse files Browse the repository at this point in the history
This should clarify what is happening in features and simplify handling the "group" column
  • Loading branch information
rantahar committed Oct 15, 2024
1 parent 3620267 commit ea6eac4
Show file tree
Hide file tree
Showing 11 changed files with 78 additions and 49 deletions.
31 changes: 15 additions & 16 deletions niimpy/preprocessing/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@
from niimpy.preprocessing import util


group_by_columns = set(["user", "device", "app_group"])


MAP_APP = {
"CrossCycle": "sports",
"Runtastic": "sports",
Expand Down Expand Up @@ -384,17 +381,17 @@ def app_count(df, bat=None, screen=None, config=None):
df2.sort_values(by=["user", "device", "datetime"], inplace=True)
df2.fillna({"app_group": "off"}, inplace=True)

df2 = df2[["user", "device", "datetime", "app_group", "application_name"]]
keep_columns = list(set(["user", "device", "group"]) & set(df.columns))
df2 = df2[keep_columns+["datetime", "app_group", "application_name"]]

df2.dropna(inplace=True)

if len(df2) > 0:
df2["datetime"] = pd.to_datetime(df2["datetime"])
df2.set_index("datetime", inplace=True)
result = util.group_data(df2, columns = group_by_columns)["app_group"].resample(**config["resample_args"], include_groups=False).count()
result = util.group_data(df2, "app_group")["app_group"].resample(**config["resample_args"], include_groups=False).count()
result = pd.DataFrame(result).rename(columns={"app_group": "count"})
result = util.reset_groups(result, columns = group_by_columns)

result = util.reset_groups(result, "app_group")
return result
return None

Expand Down Expand Up @@ -444,6 +441,8 @@ def app_duration(df, bat=None, screen=None, config=None):
config["resample_args"] = config.get("resample_args", {"rule":"30min"})
outlier_threshold = config.get("outlier_threshold", "10h")

niimpy_cols = list(set(["group", "user", "device"]) & set(df.columns))

df2 = classify_app(df, config)

# Insert missing data due to the screen being off or battery depleted
Expand All @@ -453,7 +452,7 @@ def app_duration(df, bat=None, screen=None, config=None):
screen.reset_index(inplace=True)
screen.set_index("index", inplace=True)
df2 = pd.concat([df2, screen])
df2.sort_values(by=["user", "device", "datetime"], inplace=True)
df2.sort_values(by=niimpy_cols + ["datetime"], inplace=True)
df2.fillna({"app_group": "off"}, inplace=True)

if screen.empty and not bat.empty:
Expand All @@ -463,10 +462,11 @@ def app_duration(df, bat=None, screen=None, config=None):
shutdown.reset_index(inplace=True)
shutdown.set_index("index", inplace=True)
df2 = pd.concat([df2, shutdown])
df2.sort_values(by=["user", "device", "datetime"], inplace=True)
df2.sort_values(by=niimpy_cols + ["datetime"], inplace=True)
df2.fillna({"app_group": "off"}, inplace=True)

df2 = df2[["user", "device", "time", "datetime", "app_group"]]
keep_columns = list(set(["group", "user", "device"]) & set(df.columns))
df2 = df2[keep_columns+["time", "datetime", "app_group"]]

# Fill in time gap between app foreground session
def resample_group(group):
Expand All @@ -486,6 +486,7 @@ def resample_group(group):
# Apply resampling to each group
df2 = util.group_data(df2).apply(resample_group, include_groups=False)
df2 = util.reset_groups(df2)
print(df2.shape)

df2["duration"] = np.nan
df2["duration"] = df2["datetime"].diff()
Expand All @@ -502,9 +503,9 @@ def resample_group(group):
if len(df2) > 0:
df2["datetime"] = pd.to_datetime(df2["datetime"])
df2.set_index("datetime", inplace=True)
result = util.group_data(df2, columns = group_by_columns)["duration"].resample(**config["resample_args"], include_groups=False).sum()
result = util.group_data(df2, "app_group")["duration"].resample(**config["resample_args"], include_groups=False).sum()
result = pd.DataFrame(result).rename(columns={"app_group": "count"})
return util.reset_groups(result, columns = group_by_columns)
return util.reset_groups(result, "app_group")

return None

Expand Down Expand Up @@ -547,13 +548,11 @@ def extract_features_app(df, bat=None, screen=None, features=None):

computed_features = []
for feature, feature_arg in features.items():
print(f"computing {feature}...")
computed_feature = feature(df, bat, screen, feature_arg)
index_by = list(group_by_columns & set(computed_feature.columns))
computed_feature = computed_feature.set_index(index_by, append=True)
computed_feature = util.set_conserved_index(computed_feature, "app_group")
computed_features.append(computed_feature)

computed_features = pd.concat(computed_features, axis=1)
# index the result only by the original index (datetime)
computed_features = util.reset_groups(computed_features, columns = group_by_columns)
computed_features = util.reset_groups(computed_features, "app_group")
return computed_features
6 changes: 2 additions & 4 deletions niimpy/preprocessing/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from niimpy.preprocessing import util

group_by_columns = set(["user", "device"])
group_by_columns = set(["user", "device", "group"])


def audio_count_silent(df_u, config=None):
Expand Down Expand Up @@ -549,10 +549,8 @@ def extract_features_audio(df, features=None):

computed_features = []
for feature, feature_arg in features.items():
print(f'computing {feature}...')
computed_feature = feature(df, feature_arg)
index_by = list(group_by_columns & set(computed_feature.columns))
computed_feature = computed_feature.set_index(index_by, append=True)
computed_feature = util.set_conserved_index(computed_feature)
computed_features.append(computed_feature)

computed_features = pd.concat(computed_features, axis=1)
Expand Down
6 changes: 2 additions & 4 deletions niimpy/preprocessing/battery.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@

from niimpy.preprocessing import util

group_by_columns = set(["user", "device"])


def shutdown_info(df, config=None):
""" Returns a pandas DataFrame with battery information for the timestamps when the phone
Expand Down Expand Up @@ -35,6 +33,7 @@ def shutdown_info(df, config=None):
shutdown = df[df[col_name].between(-3, 0, inclusive="neither")]
return shutdown


def battery_mean_level(df, config=None):
""" This function returns the mean battery level within the specified timeframe.
If there is no specified timeframe, the function sets a 30 min default time window.
Expand Down Expand Up @@ -468,8 +467,7 @@ def extract_features_battery(df, features=None):
for features, kwargs in features.items():
print(features, kwargs)
computed_feature = features(df, kwargs)
index_by = list(group_by_columns & set(computed_feature.columns))
computed_feature = computed_feature.set_index(index_by, append=True)
computed_feature = util.set_conserved_index(computed_feature)
computed_features.append(computed_feature)

computed_features = pd.concat(computed_features, axis=1)
Expand Down
2 changes: 1 addition & 1 deletion niimpy/preprocessing/communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from niimpy.preprocessing import util

group_by_columns = set(["user", "device"])
group_by_columns = set(["user", "device", "group"])


def _distribution(df, col_name = None, time_interval="d", bin_interval="h"):
Expand Down
2 changes: 1 addition & 1 deletion niimpy/preprocessing/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

default_freq = "1ME"

group_by_columns = set(["user", "device"])
group_by_columns = set(["user", "device", "group"])


def distance_matrix(lats, lons):
Expand Down
23 changes: 11 additions & 12 deletions niimpy/preprocessing/screen.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
from niimpy.preprocessing import battery as b
from niimpy.preprocessing import util

group_by_columns = ["user", "device"]


def util_screen(df, bat=None, config=None):
""" This function is a helper function for all other screen preprocessing.
Expand Down Expand Up @@ -38,6 +36,7 @@ def util_screen(df, bat=None, config=None):
assert isinstance(config, dict), "config is not a dictionary"

col_name = config.get("screen_column_name", "screen_status")
id_columns = util.identifier_columns(df)

df[col_name]=pd.to_numeric(df[col_name]) #convert to numeric in case it is not

Expand All @@ -49,11 +48,11 @@ def util_screen(df, bat=None, config=None):
if not shutdown.empty:
df = pd.concat([df, shutdown])
df.fillna(0, inplace=True)
df = df[["user","device","time",col_name]]
df = df[id_columns + [col_name]]

#Sort the dataframe
df.sort_index(inplace=True)
df.sort_values(by=["user","device"], inplace=True)
df.sort_values(by=id_columns, inplace=True)

#Detect missing data points
df['missing']=0
Expand Down Expand Up @@ -100,10 +99,11 @@ def event_classification_screen(df, config=None):
assert isinstance(config, dict), "config is not a dictionary"

col_name = config.get("screen_column_name", "screen_status")
id_columns = util.identifier_columns(df)

#Classify the event
df.sort_index(inplace=True)
df.sort_values(by=["user","device"], inplace=True)
df.sort_values(by=id_columns, inplace=True)
col_as_str = df[col_name].astype(int).astype(str)
next_as_str = col_as_str.shift(-1).fillna("0")
df['next'] = col_as_str + next_as_str
Expand Down Expand Up @@ -160,17 +160,16 @@ def duration_util_screen(df):
df['duration'] = df['duration'].shift(-1)

#Discard transitions between subjects
index_name = df.index.name
df = df.groupby(["user", "device"]).apply(lambda x: x.iloc[:-1], include_groups=False)
df.reset_index(["user", "device"], inplace=True)
df = util.group_data(df).apply(lambda x: x.iloc[:-1], include_groups=False)
df = util.reset_groups(df)

#Discard any datapoints whose duration in "ON" and "IN USE" states is
#longer than 10 hours because they may be artifacts
thr = pd.Timedelta('10 hours')
df = df[~((df.on==1) & (df.duration>thr))]
df = df[~((df.use==1) & (df.duration>thr))]
df["duration"] = df["duration"].dt.total_seconds()

return df

def screen_off(df, bat=None, config=None):
Expand Down Expand Up @@ -199,11 +198,12 @@ def screen_off(df, bat=None, config=None):
if config is None:
config = {}
assert isinstance(config, dict), "config is not a dictionary"
id_columns = util.identifier_columns(df)

df = util_screen(df, bat, config)
df = df[df.screen_status == 0] #Select only those OFF events when no missing data is present
df["screen_status"] = 1
df = df[["user","device","screen_status"]]
df = df[id_columns + ["screen_status"]]
df.rename(columns={"screen_status":"screen_off"}, inplace=True)
df = util.reset_groups(df)
return df
Expand Down Expand Up @@ -627,8 +627,7 @@ def extract_features_screen(df, bat=None, features=None):
computed_features = []
for feature, feature_arg in features.items():
computed_feature = feature(df, bat, feature_arg)
index_by = list(set(group_by_columns) & set(computed_feature.columns))
computed_feature = computed_feature.set_index(index_by, append=True)
computed_feature = util.set_conserved_index(computed_feature)
computed_features.append(computed_feature)

computed_features = pd.concat(computed_features, axis=1)
Expand Down
2 changes: 1 addition & 1 deletion niimpy/preprocessing/survey.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@
'PSS10_9' : PSS_ANSWER_MAP,
'PSS10_10' : PSS_ANSWER_MAP}

group_by_columns = set(["user", "device"])
group_by_columns = set(["user", "device", "group"])


def clean_survey_column_names(df):
Expand Down
2 changes: 1 addition & 1 deletion niimpy/preprocessing/tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def step_summary(df, config=None):

# Calculate sum of steps for each date
df['daily_sum'] = util.group_data( df,
columns = ['day', 'month'] + group_by_columns
['day', 'month']
)[value_col].transform('sum')

# Under the assumption that a user cannot have zero steps per day, we remove rows where daily_sum are zero
Expand Down
45 changes: 38 additions & 7 deletions niimpy/preprocessing/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,19 +177,50 @@ def format_column_names(df):
df.rename(columns=column_map, inplace=True)


def group_data(df, columns=["user", "device"]):
""" Group the dataframe by a standard set of columns listed in
group_by_columns."""
columns = list(set(columns) & set(df.columns))
def identifier_columns(df, id_columns=("user", "device", "group")):
    """Build a list of standard Niimpy identifier columns in the dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose columns are inspected.
    id_columns : iterable of str, optional
        Candidate identifier column names. Defaults to 'user', 'device'
        and 'group'.

    Returns
    -------
    list of str
        The names from ``id_columns`` that are columns of ``df``, without
        duplicates and in the order they appear in ``id_columns``.
    """
    present = set(df.columns)
    # Filter in candidate order instead of intersecting two sets: the
    # iteration order of a set of strings is hash dependent, which made the
    # resulting column order unpredictable.
    return [name for name in dict.fromkeys(id_columns) if name in present]


def group_data(df, additional_columns=None, id_columns=("user", "device", "group")):
    """Group the dataframe by the Niimpy identifier columns present in it.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to group.
    additional_columns : str or iterable of str, optional
        Extra column name(s) to group by in addition to the identifier
        columns. A single string is treated as one column name.
    id_columns : iterable of str, optional
        Identifier column names. Defaults to 'user', 'device' and 'group'.

    Returns
    -------
    pandas.core.groupby.DataFrameGroupBy
        ``df`` grouped by those requested columns that exist in ``df``.
    """
    if additional_columns is None:
        additional_columns = []
    elif isinstance(additional_columns, str):
        additional_columns = [additional_columns]
    # Convert both inputs to lists so that sets and tuples of column names
    # can be combined with the identifier columns.
    candidates = list(id_columns) + list(additional_columns)
    columns = identifier_columns(df, candidates)
    return df.groupby(columns)


def reset_groups(df, columns = set(["user", "device"])):
""" Reset the grouping, keeping only the original index columns. """
columns = list(set(columns) & set(df.index.names))
def reset_groups(df, additional_columns=None, id_columns=("user", "device", "group")):
    """Reset id columns and optional additional columns in the dataframe index.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose index may contain identifier levels.
    additional_columns : str or iterable of str, optional
        Extra index level name(s) to reset in addition to the identifier
        columns. A single string is treated as one name.
    id_columns : iterable of str, optional
        Identifier names. Defaults to 'user', 'device' and 'group'.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the requested levels that are present in the index
        turned back into regular columns. Other index levels are kept.
    """
    if additional_columns is None:
        additional_columns = []
    elif isinstance(additional_columns, str):
        additional_columns = [additional_columns]
    # Accept any iterable (list, tuple, set) of names, not only lists.
    requested = dict.fromkeys([*id_columns, *additional_columns])
    index_names = set(df.index.names)
    levels = [name for name in requested if name in index_names]
    return df.reset_index(levels)


def set_conserved_index(df, additional_columns=None, id_columns=("user", "device", "group")):
    """Set standard id columns as index.

    The identifier columns are appended to the existing index, which allows
    concatenating dataframes with different measurements.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding identifier columns as regular columns.
    additional_columns : str or iterable of str, optional
        Extra column name(s) to move into the index. A single string is
        treated as one column name.
    id_columns : iterable of str, optional
        Identifier column names. Defaults to 'user', 'device' and 'group'.

    Returns
    -------
    pandas.DataFrame
        A dataframe whose index is the original index followed by the
        requested columns that exist in ``df``, in the order
        ``id_columns`` then ``additional_columns``.
    """
    if additional_columns is None:
        additional_columns = []
    elif isinstance(additional_columns, str):
        additional_columns = [additional_columns]
    requested = dict.fromkeys([*id_columns, *additional_columns])
    present = set(df.columns)
    # Keep the level order fixed. A set intersection would order the new
    # index levels by string hash, so frames built in different calls could
    # end up with differently ordered index levels.
    index_by = [name for name in requested if name in present]
    return df.set_index(index_by, append=True)


def set_encoding(df, to_encoding = 'utf-8', from_encoding = 'iso-8859-1'):
""" Recode the dataframe to a different encoding. This is useful when
the encoding in a data file is set incorrectly and utf characters are
Expand Down
4 changes: 4 additions & 0 deletions tests/preprocessing/test_apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@

def test_app_features():
# TEST 1
data["group"] = "group1"
screen["group"] = "group1"
battery["group"] = "group1"
test = app.extract_features_app(data, battery, screen, features=None)

user_comm = test[(test["user"] == "dvWdLQesv21a") & (test["app_group"] == "comm")]
Expand All @@ -37,6 +40,7 @@ def test_app_features():
user_comm = test[(test["user"] == "dvWdLQesv21a") & (test["app_group"] == "comm")]
user_work = test[(test["user"] == "dvWdLQesv21a") & (test["app_group"] == "work")]

#assert user_comm.loc["2019-08-05 20:00:00+03:00"]["group"] == "group1"
assert user_comm.loc["2019-08-05 20:00:00+03:00"]["count"] == 3
assert user_work.loc["2019-08-06 04:00:00+03:00"]["count"] == 2
assert user_comm.loc["2019-08-05 20:00:00+03:00"]["duration"] == 3569.00
Expand Down
4 changes: 2 additions & 2 deletions tests/preprocessing/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
data = niimpy.read_csv(config.MULTIUSER_AWARE_AUDIO_PATH, tz='Europe/Helsinki')

def test_audio_features():

data["group"] = "group1"
test = audio.extract_features_audio(data)

test_user1 = test[test["user"] == "jd9INuQ5BBlW"]
Expand All @@ -27,7 +27,7 @@ def test_audio_features():
assert test_user1.loc[pd.Timestamp("2020-01-09 06:00:00", tz='Europe/Helsinki')]["audio_max_db"] == 75
assert test_user1.loc[pd.Timestamp("2020-01-09 06:00:00", tz='Europe/Helsinki')]["audio_mean_db"] == 75
assert test_user1.loc[pd.Timestamp("2020-01-09 06:00:00", tz='Europe/Helsinki')]["audio_median_db"] == 75

assert test_user1.loc["2020-01-09 10:30:00"]["group"] == "group1"

test_user2 = test[test["user"] == "iGyXetHE3S8u"]
assert test_user2["audio_count_silent"].sum() == 3
Expand Down

0 comments on commit ea6eac4

Please sign in to comment.