Skip to content

Commit

Permalink
Test that each feature conserves the "group" column
Browse files Browse the repository at this point in the history
  • Loading branch information
rantahar committed Oct 15, 2024
1 parent ea6eac4 commit 618d086
Show file tree
Hide file tree
Showing 7 changed files with 29 additions and 29 deletions.
4 changes: 0 additions & 4 deletions niimpy/preprocessing/battery.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,9 +471,5 @@ def extract_features_battery(df, features=None):
computed_features.append(computed_feature)

computed_features = pd.concat(computed_features, axis=1)

if 'group' in df:
computed_features['group'] = df.groupby('user')['group'].first()

computed_features = util.reset_groups(computed_features)
return computed_features
3 changes: 1 addition & 2 deletions niimpy/preprocessing/communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,8 +596,7 @@ def extract_features_comms(df, features=None):
for feature, feature_arg in features.items():
print(f'computing {feature}...')
computed_feature = feature(df, feature_arg)
index_by = list(set(group_by_columns) & set(computed_feature.columns))
computed_feature = computed_feature.set_index(index_by, append=True)
computed_feature = util.set_conserved_index(computed_feature)
computed_features.append(computed_feature)

computed_features = pd.concat(computed_features, axis=1)
Expand Down
9 changes: 1 addition & 8 deletions niimpy/preprocessing/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@

default_freq = "1ME"

group_by_columns = set(["user", "device", "group"])


def distance_matrix(lats, lons):
"""Compute distance matrix using great-circle distance formula
Expand Down Expand Up @@ -533,14 +531,9 @@ def extract_features_location(df, features=None):
computed_features = []
for features, feature_arg in features.items():
computed_feature = features(df, feature_arg)
index_by = list(group_by_columns & set(computed_feature.columns))
computed_feature = computed_feature.set_index(index_by, append=True)
computed_feature = util.set_conserved_index(computed_feature)
computed_features.append(computed_feature)

computed_features = pd.concat(computed_features, axis=1)

if 'group' in df:
computed_features['group'] = df.groupby('user')['group'].first()

computed_features = util.reset_groups(computed_features)
return computed_features
8 changes: 6 additions & 2 deletions tests/preprocessing/test_battery.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
})
df11['datetime'] = pd.to_datetime(df11['datetime'])
df11 = df11.set_index('datetime', drop=False)

df11["group"] = "group1"

def test_format_battery_data():
df = df11.copy()
Expand All @@ -34,7 +34,8 @@ def test_format_battery_data():
assert battery.loc[Timestamp('2019-01-17 09:21:26.036000+02:00'), 'battery_health'] == '2'
assert battery.loc[Timestamp('2019-01-17 09:48:59.438999808+02:00'), 'battery_status'] == '-2'
assert battery.loc[Timestamp('2019-01-17 09:57:11.275000064+02:00'), 'battery_adaptor'] == '1'

assert battery.loc[Timestamp('2019-01-17 09:57:11.275000064+02:00'), 'group'] =="group1"


def test_battery_occurrences():
df = df11.copy()
Expand All @@ -46,6 +47,7 @@ def test_battery_occurrences():
assert occurrences_user.loc[Timestamp('2019-01-17 10:00:00+02:00')]["occurrences"] == 1
occurrences_user = occurrences[occurrences["user"] == "Afxzi7oI0yyp"]
assert occurrences_user.loc[Timestamp('2019-01-17 09:30:00+02:00')]["occurrences"] == 3
assert occurrences_user.loc[Timestamp('2019-01-17 09:30:00+02:00'), 'group'] =="group1"


def test_battery_gaps():
Expand All @@ -58,6 +60,7 @@ def test_battery_gaps():
assert gaps_user.loc[Timestamp('2019-01-17 09:30:00+02:00')]["battery_gap"] == pd.Timedelta('0 days 00:04:26.149666560')
gaps_user = gaps[gaps["user"] == "lb983ODxEFUD"]
assert gaps_user.loc[Timestamp('2019-01-17 09:30:00+02:00')]["battery_gap"] == pd.Timedelta('0 days 00:01:00.453499904')
assert gaps_user.loc[Timestamp('2019-01-17 09:30:00+02:00'), 'group'] =="group1"


def test_battery_charge_discharge():
Expand All @@ -68,3 +71,4 @@ def test_battery_charge_discharge():
assert chdisch_user.loc[Timestamp('2019-01-17 10:30:00+02:00')]['bdelta'] == -2.
chdisch_user = chdisch[chdisch["user"] == "lb983ODxEFUD"]
assert chdisch_user.loc[Timestamp('2019-01-17 10:30:00+02:00')]['charge/discharge'] == -0.001050474788377773
assert chdisch_user.loc[Timestamp('2019-01-17 10:30:00+02:00'), 'group'] =="group1"
14 changes: 12 additions & 2 deletions tests/preprocessing/test_communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
import pytest

import pandas as pd
import numpy as np
import zipfile

import niimpy
import niimpy.preprocessing.communication as comms
Expand All @@ -13,6 +11,7 @@

def test_audio_features():
data = niimpy.read_csv(config.MULTIUSER_AWARE_CALLS_PATH, tz='Europe/Helsinki')
data["group"] = "group1"
test = comms.extract_features_comms(data, features=None)

test_user = test[test["user"] == "jd9INuQ5BBlW"]
Expand Down Expand Up @@ -41,6 +40,7 @@ def test_audio_features():
assert test_user.loc[pd.Timestamp("2019-08-10 15:00:00", tz='Europe/Helsinki')]["missed_duration_median"] == 0
assert test_user.loc[pd.Timestamp("2019-08-10 15:00:00", tz='Europe/Helsinki')]["missed_duration_std"] == 0
assert test_user.loc[pd.Timestamp("2019-08-10 15:00:00", tz='Europe/Helsinki')]["outgoing_incoming_ratio"] == 2
assert test_user.loc[pd.Timestamp("2019-08-10 15:00:00", tz='Europe/Helsinki')]["group"] == "group1"

features ={comms.call_count:{"communication_column_name":"call_duration","resample_args":{"rule":"1D"}},
comms.call_outgoing_incoming_ratio:{"communication_column_name":"call_duration","resample_args":{"rule":"1D"}}}
Expand All @@ -52,6 +52,7 @@ def test_audio_features():
test_user = test[test["user"] == "iGyXetHE3S8u"]
assert test_user.loc[pd.Timestamp("2019-08-10", tz='Europe/Helsinki')]["outgoing_count"] == 2
assert test_user.loc[pd.Timestamp("2019-08-10", tz='Europe/Helsinki')]["outgoing_incoming_ratio"] == 2
assert test_user.loc[pd.Timestamp("2019-08-10", tz='Europe/Helsinki')]["group"] == "group1"

features ={comms.call_duration_total:{"audio_column_name":"double_frequency","resample_args":{"rule":"2h"}},
comms.call_duration_mean:{"audio_column_name":"double_frequency","resample_args":{"rule":"2h"}},
Expand All @@ -69,16 +70,19 @@ def test_audio_features():
assert test_user.loc[pd.Timestamp("2019-08-13 06:00:00", tz='Europe/Helsinki')]["incoming_duration_mean"] == 591
assert test_user.loc[pd.Timestamp("2019-08-13 06:00:00", tz='Europe/Helsinki')]["incoming_duration_median"] == 591
assert test_user.loc[pd.Timestamp("2019-08-13 06:00:00", tz='Europe/Helsinki')]["incoming_duration_std"] == 0
assert test_user.loc[pd.Timestamp("2019-08-13 06:00:00", tz='Europe/Helsinki')]["group"] == "group1"


def test_message_features():
    """Check message features and the conserved "group" column for one user.

    Reads the multi-user AWARE messages sample, labels every row with
    ``group == "group1"``, runs ``extract_features_comms`` with
    ``features=None`` and verifies the counts, the ratio and the group
    label at a single timestamp.
    """
    messages = niimpy.read_csv(config.MULTIUSER_AWARE_MESSAGES_PATH, tz='Europe/Helsinki')
    messages["group"] = "group1"
    extracted = comms.extract_features_comms(messages, features=None)

    # Every assertion inspects the same user at the same timestamp.
    when = pd.Timestamp("2020-01-09 02:30:00+02:00", tz='Europe/Helsinki')
    row = extracted[extracted["user"] == "jd9INuQ5BBlW"].loc[when]

    assert row["outgoing_count"] == 5
    assert row["incoming_count"] == 5
    assert row["outgoing_incoming_ratio"] == 1.0
    assert row["group"] == "group1"


def test_message_features_with_gmail():
Expand All @@ -87,11 +91,13 @@ def test_message_features_with_gmail():
data = niimpy.reading.google_takeout.email_activity(
path, sentiment_batch_size = 2
)
data["group"] = "group1"

test = comms.extract_features_comms(data, features=None)
assert test.loc[pd.Timestamp("2023-12-15 12:30:00+00:00", tz='Europe/Helsinki')]["outgoing_count"] == 0
assert test.loc[pd.Timestamp("2023-12-15 12:30:00+00:00", tz='Europe/Helsinki')]["incoming_count"] == 2
assert test.loc[pd.Timestamp("2023-12-15 12:30:00+00:00", tz='Europe/Helsinki')]["outgoing_incoming_ratio"] == 0
assert test.loc[pd.Timestamp("2023-12-15 12:30:00+00:00", tz='Europe/Helsinki')]["group"] == "group1"


def test_message_features_with_google_chat(google_takeout_zipped):
Expand All @@ -101,14 +107,18 @@ def test_message_features_with_google_chat(google_takeout_zipped):
sentiment=False,
sentiment_batch_size = 2
)
data["group"] = "group1"

test = comms.extract_features_comms(data, features=None)
print(test.loc[pd.Timestamp("2024-01-30 13:00:00+00:00", tz='Europe/Helsinki')])
assert test.loc[pd.Timestamp("2024-01-30 13:00:00+00:00", tz='Europe/Helsinki')]["outgoing_count"] == 2
assert test.loc[pd.Timestamp("2024-01-30 13:00:00+00:00", tz='Europe/Helsinki')]["group"] == "group1"


def test_call_distribution():
    """Check ``call_distribution`` output and that it keeps the "group" column.

    Reads the multi-user AWARE calls sample, labels every row with
    ``group == "group1"`` and verifies the distribution value and the
    group label for one user at a single timestamp.
    """
    calls = niimpy.read_csv(config.MULTIUSER_AWARE_CALLS_PATH, tz='Europe/Helsinki')
    calls["group"] = "group1"
    distribution = niimpy.preprocessing.communication.call_distribution(calls)

    # Both assertions inspect the same user at the same timestamp.
    when = pd.Timestamp("2020-01-09 02:00:00", tz='Europe/Helsinki')
    row = distribution[distribution["user"] == "jd9INuQ5BBlW"].loc[when]

    assert row["distribution"] == pytest.approx(0.88888888)
    assert row["group"] == "group1"
9 changes: 2 additions & 7 deletions tests/preprocessing/test_location.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
import os

import numpy as np
import pandas as pd

from geopy.distance import distance

Expand All @@ -11,6 +8,7 @@

# read sample data
data = niimpy.read_csv(config.GPS_PATH, tz='et')
data["group"] = "group1"

def test_distance_matrix():

Expand All @@ -33,9 +31,6 @@ def test_distance_matrix():


def test_location_features():

assert data.shape[0] >= data.shape[0], "Number of rows should not increase"

# extract features
features = nilo.extract_features_location(data)

Expand All @@ -44,7 +39,6 @@ def test_location_features():

features_u1 = features[features["user"] == 'gps_u00']
features_u1 = features_u1.dropna().iloc[1]
print(features_u1)

assert features_u1['n_significant_places'] == 11.0
assert features_u1['n_sps'] == 11.0
Expand All @@ -68,5 +62,6 @@ def test_location_features():
assert np.abs(features_u1['speed_max'] - 33.25) < 0.1
assert np.abs(features_u1['variance'] - 0.237454) < 0.1
assert np.abs(features_u1['log_variance'] - -1.437781) < 0.1
assert features_u1['group'] == "group1"


11 changes: 7 additions & 4 deletions tests/preprocessing/test_screen.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
import os

import numpy as np
import pandas as pd
import numpy as np

Expand All @@ -11,6 +8,8 @@
# read sample data
data = niimpy.read_csv(config.MULTIUSER_AWARE_SCREEN_PATH, tz='Europe/Helsinki')
bat = niimpy.read_csv(config.MULTIUSER_AWARE_BATTERY_PATH, tz='Europe/Helsinki')
data["group"] = "group1"
bat["group"] = "group1"

def test_audio_features():

Expand Down Expand Up @@ -40,6 +39,7 @@ def test_audio_features():
assert test_user.loc[time]["screen_use_durationmean"] < 47
assert test_user.loc[time]["screen_use_durationmedian"] < 47
assert test_user.loc[time]["screen_use_durationstd"] < 11
assert test_user.loc[time]["group"] == "group1"

time = pd.Timestamp("2019-08-08 22:30:00", tz='Europe/Helsinki')

Expand All @@ -65,6 +65,7 @@ def test_audio_features():
assert test_user2.loc[time]["screen_use_durationmean"] < 0.2
assert test_user2.loc[time]["screen_use_durationmedian"] < 0.2
assert test_user2.loc[time]["screen_use_durationstd"] < 0.1
assert test_user2.loc[time]["group"] == "group1"


features ={sc.screen_count:{"screen_column_name":"screen_status","resample_args":{"rule":"1D"}},
Expand All @@ -79,6 +80,7 @@ def test_audio_features():
assert test_user2.loc[pd.Timestamp("2019-08-08", tz='Europe/Helsinki')]["screen_use_count"] == 6
assert test_user2.loc[pd.Timestamp("2019-08-31", tz='Europe/Helsinki')]["screen_on_durationtotal"] < 0.25
assert test_user2.loc[pd.Timestamp("2019-08-31", tz='Europe/Helsinki')]["screen_off_durationtotal"] < 446000
assert test_user2.loc[pd.Timestamp("2019-08-31", tz='Europe/Helsinki')]["group"] == "group1"

features ={sc.screen_duration_min:{"screen_column_name":"screen_status","resample_args":{"rule":"12h"}},
sc.screen_duration_max:{"screen_column_name":"screen_status","resample_args":{"rule":"12h"}},
Expand All @@ -93,4 +95,5 @@ def test_audio_features():
assert test_user.loc[pd.Timestamp("2020-01-09 12:00:00", tz='Europe/Helsinki')]["screen_use_durationmaximum"] < 290
assert test_user2.loc[pd.Timestamp("2019-08-15 12:00:00", tz='Europe/Helsinki')]["screen_on_durationmedian"] < 18.5
assert test_user2.loc[pd.Timestamp("2019-08-15 12:00:00", tz='Europe/Helsinki')]["screen_use_durationmedian"] < 0.35
assert test_user2.loc[pd.Timestamp("2019-08-15 12:00:00", tz='Europe/Helsinki')]["screen_off_durationmaximum"] < 182350
assert test_user2.loc[pd.Timestamp("2019-08-15 12:00:00", tz='Europe/Helsinki')]["screen_off_durationmaximum"] < 182350
assert test_user2.loc[pd.Timestamp("2019-08-15 12:00:00", tz='Europe/Helsinki')]["group"] == "group1"

0 comments on commit 618d086

Please sign in to comment.