From 785e8a200cac93ee26b2006218a19f55c8fa543b Mon Sep 17 00:00:00 2001 From: Rantaharju Jarno Date: Tue, 29 Oct 2024 13:18:13 +0200 Subject: [PATCH] Remove config from location features --- docs/user_guide/preprocessing/location.ipynb | 378 +++++++++---------- niimpy/preprocessing/location.py | 59 ++- 2 files changed, 213 insertions(+), 224 deletions(-) diff --git a/docs/user_guide/preprocessing/location.ipynb b/docs/user_guide/preprocessing/location.ipynb index 46ff53ac..8316fca3 100644 --- a/docs/user_guide/preprocessing/location.ipynb +++ b/docs/user_guide/preprocessing/location.ipynb @@ -307,7 +307,7 @@ ], "source": [ "binned_data = niimpy.util.aggregate(data, freq='5min', method_numerical='median')\n", - "binned_data = binned_data.reset_index(0).dropna()\n", + "binned_data = binned_data.dropna()\n", "binned_data.shape" ] }, @@ -559,25 +559,25 @@ " \n", " user\n", " n_significant_places\n", - " n_sps\n", - " n_static\n", - " n_moving\n", - " n_rare\n", - " n_home\n", - " max_dist_home\n", " n_transitions\n", + " normalized_entropy\n", " n_top1\n", + " n_top4\n", + " n_rare\n", + " n_top2\n", + " n_moving\n", + " max_dist_home\n", " ...\n", - " n_top5\n", + " n_top3\n", " entropy\n", - " normalized_entropy\n", + " n_sps\n", + " log_variance\n", + " variance\n", " dist_total\n", - " n_bins\n", - " speed_average\n", " speed_variance\n", + " speed_average\n", + " n_bins\n", " speed_max\n", - " variance\n", - " log_variance\n", " \n", " \n", " \n", @@ -585,169 +585,169 @@ " 2013-03-31 00:00:00+02:00\n", " gps_u00\n", " 6\n", - " 5.0\n", - " 280.0\n", - " 8.0\n", - " 3.0\n", - " 106.0\n", - " 2.074186e+04\n", " 48.0\n", + " 3.163631\n", " 106.0\n", + " 20.0\n", + " 3.0\n", + " 99.0\n", + " 8.0\n", + " 2.074186e+04\n", " ...\n", - " 18.0\n", + " 34.0\n", " 5.091668\n", - " 3.163631\n", + " 5.0\n", + " -5.761688\n", + " 0.003146\n", " 4.132581e+05\n", - " 288.0\n", - " 0.033496\n", " 0.044885\n", + " 0.033496\n", + " 288.0\n", " 1.750000\n", - " 0.003146\n", - " -5.761688\n", " \n", " \n", " 2013-04-30 00:00:00+03:00\n", " gps_u00\n", " 10\n", - " 10.0\n", - " 1966.0\n", - " 66.0\n", - " 45.0\n", - " 1010.0\n", - " 2.914790e+05\n", " 194.0\n", + " 3.163793\n", " 1016.0\n", + " 45.0\n", + " 45.0\n", + " 668.0\n", + " 66.0\n", + " 2.914790e+05\n", " ...\n", - " 38.0\n", + " 135.0\n", " 7.284903\n", - " 3.163793\n", + " 10.0\n", + " -1.439133\n", + " 0.237133\n", " 2.179693e+06\n", - " 2032.0\n", - " 0.269932\n", " 6.129277\n", + " 0.269932\n", + " 2032.0\n", " 33.250000\n", - " 0.237133\n", - " -1.439133\n", " \n", " \n", " 2013-05-31 00:00:00+03:00\n", " gps_u00\n", " 15\n", - " 12.0\n", - " 1827.0\n", - " 76.0\n", - " 86.0\n", - " 1028.0\n", - " 1.041741e+06\n", " 107.0\n", + " 2.696752\n", " 1030.0\n", + " 65.0\n", + " 86.0\n", + " 501.0\n", + " 76.0\n", + " 1.041741e+06\n", " ...\n", - " 46.0\n", + " 86.0\n", " 6.701177\n", - " 2.696752\n", + " 12.0\n", + " 2.114892\n", + " 8.288687\n", " 6.986551e+06\n", - " 1903.0\n", - " 0.351280\n", " 7.590639\n", + " 0.351280\n", + " 1903.0\n", " 34.000000\n", - " 8.288687\n", - " 2.114892\n", " \n", " \n", " 2013-06-30 00:00:00+03:00\n", " gps_u00\n", " 1\n", - " 1.0\n", - " 22.0\n", - " 2.0\n", + " 10.0\n", + " 0.000000\n", " 15.0\n", " 0.0\n", - " 2.035837e+04\n", - " 10.0\n", " 15.0\n", + " 7.0\n", + " 2.0\n", + " 2.035837e+04\n", " ...\n", " 0.0\n", " 0.000000\n", - " 0.000000\n", + " 1.0\n", + " -4.200287\n", + " 0.014991\n", " 2.252893e+05\n", - " 24.0\n", - " 0.044126\n", " 0.021490\n", + " 0.044126\n", + " 24.0\n", " 0.559017\n", - " 0.014991\n", - " -4.200287\n", " \n", " \n", " 2013-03-31 00:00:00+02:00\n", " gps_u01\n", " 4\n", - " 2.0\n", - " 307.0\n", - " 18.0\n", - " 0.0\n", - " 260.0\n", - " 6.975303e+02\n", " 8.0\n", + " 4.392317\n", " 286.0\n", + " 0.0\n", + " 0.0\n", + " 21.0\n", + " 18.0\n", + " 6.975303e+02\n", " ...\n", " 0.0\n", " 3.044522\n", - " 4.392317\n", + " 2.0\n", + " -12.520989\n", + " 0.000004\n", " 1.328713e+04\n", - " 325.0\n", - " 0.056290\n", " 0.073370\n", + " 0.056290\n", + " 325.0\n", " 2.692582\n", - " 0.000004\n", - " -12.520989\n", " \n", " \n", " 2013-04-30 00:00:00+03:00\n", " gps_u01\n", " 4\n", + " 2.0\n", + " 0.000000\n", + " 1998.0\n", + " 0.0\n", " 1.0\n", - " 1999.0\n", - " 71.0\n", " 1.0\n", - " 1500.0\n", + " 71.0\n", " 1.156568e+04\n", - " 2.0\n", - " 1998.0\n", " ...\n", " 0.0\n", " 0.000000\n", - " 0.000000\n", + " 1.0\n", + " -10.510017\n", + " 0.000027\n", " 1.238429e+05\n", - " 2070.0\n", - " 0.066961\n", " 0.629393\n", + " 0.066961\n", + " 2070.0\n", " 32.750000\n", - " 0.000027\n", - " -10.510017\n", " \n", " \n", " 2013-05-31 00:00:00+03:00\n", " gps_u01\n", " 2\n", + " 2.0\n", + " 0.000000\n", + " 3078.0\n", + " 0.0\n", " 1.0\n", - " 3079.0\n", - " 34.0\n", " 1.0\n", - " 45.0\n", + " 34.0\n", " 3.957650e+03\n", - " 2.0\n", - " 3078.0\n", " ...\n", " 0.0\n", " 0.000000\n", - " 0.000000\n", + " 1.0\n", + " -11.364454\n", + " 0.000012\n", " 1.228235e+05\n", - " 3113.0\n", - " 0.026392\n", " 0.261978\n", + " 0.026392\n", + " 3113.0\n", " 20.250000\n", - " 0.000012\n", - " -11.364454\n", " \n", " \n", "\n", @@ -755,59 +755,50 @@ "" ], "text/plain": [ - " user n_significant_places n_sps n_static \\\n", - "2013-03-31 00:00:00+02:00 gps_u00 6 5.0 280.0 \n", - "2013-04-30 00:00:00+03:00 gps_u00 10 10.0 1966.0 \n", - "2013-05-31 00:00:00+03:00 gps_u00 15 12.0 1827.0 \n", - "2013-06-30 00:00:00+03:00 gps_u00 1 1.0 22.0 \n", - "2013-03-31 00:00:00+02:00 gps_u01 4 2.0 307.0 \n", - "2013-04-30 00:00:00+03:00 gps_u01 4 1.0 1999.0 \n", - "2013-05-31 00:00:00+03:00 gps_u01 2 1.0 3079.0 \n", + " user n_significant_places n_transitions \\\n", + "2013-03-31 00:00:00+02:00 gps_u00 6 48.0 \n", + "2013-04-30 00:00:00+03:00 gps_u00 10 194.0 \n", + "2013-05-31 00:00:00+03:00 gps_u00 15 107.0 \n", + "2013-06-30 00:00:00+03:00 gps_u00 1 10.0 \n", + "2013-03-31 00:00:00+02:00 gps_u01 4 8.0 \n", + "2013-04-30 00:00:00+03:00 gps_u01 4 2.0 \n", + "2013-05-31 00:00:00+03:00 gps_u01 2 2.0 \n", "\n", - " n_moving n_rare n_home max_dist_home \\\n", - "2013-03-31 00:00:00+02:00 8.0 3.0 106.0 2.074186e+04 \n", - "2013-04-30 00:00:00+03:00 66.0 45.0 1010.0 2.914790e+05 \n", - "2013-05-31 00:00:00+03:00 76.0 86.0 1028.0 1.041741e+06 \n", - "2013-06-30 00:00:00+03:00 2.0 15.0 0.0 2.035837e+04 \n", - "2013-03-31 00:00:00+02:00 18.0 0.0 260.0 6.975303e+02 \n", - "2013-04-30 00:00:00+03:00 71.0 1.0 1500.0 1.156568e+04 \n", - "2013-05-31 00:00:00+03:00 34.0 1.0 45.0 3.957650e+03 \n", + " normalized_entropy n_top1 n_top4 n_rare n_top2 \\\n", + "2013-03-31 00:00:00+02:00 3.163631 106.0 20.0 3.0 99.0 \n", + "2013-04-30 00:00:00+03:00 3.163793 1016.0 45.0 45.0 668.0 \n", + "2013-05-31 00:00:00+03:00 2.696752 1030.0 65.0 86.0 501.0 \n", + "2013-06-30 00:00:00+03:00 0.000000 15.0 0.0 15.0 7.0 \n", + "2013-03-31 00:00:00+02:00 4.392317 286.0 0.0 0.0 21.0 \n", + "2013-04-30 00:00:00+03:00 0.000000 1998.0 0.0 1.0 1.0 \n", + "2013-05-31 00:00:00+03:00 0.000000 3078.0 0.0 1.0 1.0 \n", "\n", - " n_transitions n_top1 ... n_top5 entropy \\\n", - "2013-03-31 00:00:00+02:00 48.0 106.0 ... 18.0 5.091668 \n", - "2013-04-30 00:00:00+03:00 194.0 1016.0 ... 38.0 7.284903 \n", - "2013-05-31 00:00:00+03:00 107.0 1030.0 ... 46.0 6.701177 \n", - "2013-06-30 00:00:00+03:00 10.0 15.0 ... 0.0 0.000000 \n", - "2013-03-31 00:00:00+02:00 8.0 286.0 ... 0.0 3.044522 \n", - "2013-04-30 00:00:00+03:00 2.0 1998.0 ... 0.0 0.000000 \n", - "2013-05-31 00:00:00+03:00 2.0 3078.0 ... 0.0 0.000000 \n", + " n_moving max_dist_home ... n_top3 entropy \\\n", + "2013-03-31 00:00:00+02:00 8.0 2.074186e+04 ... 34.0 5.091668 \n", + "2013-04-30 00:00:00+03:00 66.0 2.914790e+05 ... 135.0 7.284903 \n", + "2013-05-31 00:00:00+03:00 76.0 1.041741e+06 ... 86.0 6.701177 \n", + "2013-06-30 00:00:00+03:00 2.0 2.035837e+04 ... 0.0 0.000000 \n", + "2013-03-31 00:00:00+02:00 18.0 6.975303e+02 ... 0.0 3.044522 \n", + "2013-04-30 00:00:00+03:00 71.0 1.156568e+04 ... 0.0 0.000000 \n", + "2013-05-31 00:00:00+03:00 34.0 3.957650e+03 ... 0.0 0.000000 \n", "\n", - " normalized_entropy dist_total n_bins \\\n", - "2013-03-31 00:00:00+02:00 3.163631 4.132581e+05 288.0 \n", - "2013-04-30 00:00:00+03:00 3.163793 2.179693e+06 2032.0 \n", - "2013-05-31 00:00:00+03:00 2.696752 6.986551e+06 1903.0 \n", - "2013-06-30 00:00:00+03:00 0.000000 2.252893e+05 24.0 \n", - "2013-03-31 00:00:00+02:00 4.392317 1.328713e+04 325.0 \n", - "2013-04-30 00:00:00+03:00 0.000000 1.238429e+05 2070.0 \n", - "2013-05-31 00:00:00+03:00 0.000000 1.228235e+05 3113.0 \n", + " n_sps log_variance variance dist_total \\\n", + "2013-03-31 00:00:00+02:00 5.0 -5.761688 0.003146 4.132581e+05 \n", + "2013-04-30 00:00:00+03:00 10.0 -1.439133 0.237133 2.179693e+06 \n", + "2013-05-31 00:00:00+03:00 12.0 2.114892 8.288687 6.986551e+06 \n", + "2013-06-30 00:00:00+03:00 1.0 -4.200287 0.014991 2.252893e+05 \n", + "2013-03-31 00:00:00+02:00 2.0 -12.520989 0.000004 1.328713e+04 \n", + "2013-04-30 00:00:00+03:00 1.0 -10.510017 0.000027 1.238429e+05 \n", + "2013-05-31 00:00:00+03:00 1.0 -11.364454 0.000012 1.228235e+05 \n", "\n", - " speed_average speed_variance speed_max variance \\\n", - "2013-03-31 00:00:00+02:00 0.033496 0.044885 1.750000 0.003146 \n", - "2013-04-30 00:00:00+03:00 0.269932 6.129277 33.250000 0.237133 \n", - "2013-05-31 00:00:00+03:00 0.351280 7.590639 34.000000 8.288687 \n", - "2013-06-30 00:00:00+03:00 0.044126 0.021490 0.559017 0.014991 \n", - "2013-03-31 00:00:00+02:00 0.056290 0.073370 2.692582 0.000004 \n", - "2013-04-30 00:00:00+03:00 0.066961 0.629393 32.750000 0.000027 \n", - "2013-05-31 00:00:00+03:00 0.026392 0.261978 20.250000 0.000012 \n", - "\n", - " log_variance \n", - "2013-03-31 00:00:00+02:00 -5.761688 \n", - "2013-04-30 00:00:00+03:00 -1.439133 \n", - "2013-05-31 00:00:00+03:00 2.114892 \n", - "2013-06-30 00:00:00+03:00 -4.200287 \n", - "2013-03-31 00:00:00+02:00 -12.520989 \n", - "2013-04-30 00:00:00+03:00 -10.510017 \n", - "2013-05-31 00:00:00+03:00 -11.364454 \n", + " speed_variance speed_average n_bins speed_max \n", + "2013-03-31 00:00:00+02:00 0.044885 0.033496 288.0 1.750000 \n", + "2013-04-30 00:00:00+03:00 6.129277 0.269932 2032.0 33.250000 \n", + "2013-05-31 00:00:00+03:00 7.590639 0.351280 1903.0 34.000000 \n", + "2013-06-30 00:00:00+03:00 0.021490 0.044126 24.0 0.559017 \n", + "2013-03-31 00:00:00+02:00 0.073370 0.056290 325.0 2.692582 \n", + "2013-04-30 00:00:00+03:00 0.629393 0.066961 2070.0 32.750000 \n", + "2013-05-31 00:00:00+03:00 0.261978 0.026392 3113.0 20.250000 \n", "\n", "[7 rows x 23 columns]" ] @@ -821,7 +812,8 @@ "import warnings\n", "warnings.filterwarnings('ignore', category=RuntimeWarning)\n", "\n", - "# extract all the available features\n", + "# extract all the availa\n", + "# ble features\n", "all_features = nilo.extract_features_location(binned_data)\n", "all_features" ] @@ -854,115 +846,115 @@ " \n", " \n", " user\n", + " log_variance\n", + " variance\n", " dist_total\n", - " n_bins\n", - " speed_average\n", " speed_variance\n", + " speed_average\n", + " n_bins\n", " speed_max\n", - " variance\n", - " log_variance\n", " \n", " \n", " \n", " \n", " 2013-03-31 00:00:00+02:00\n", " gps_u00\n", + " -5.761688\n", + " 0.003146\n", " 4.132581e+05\n", - " 288.0\n", - " 0.033496\n", " 0.044885\n", + " 0.033496\n", + " 288.0\n", " 1.750000\n", - " 0.003146\n", - " -5.761688\n", " \n", " \n", " 2013-04-30 00:00:00+03:00\n", " gps_u00\n", + " -1.439133\n", + " 0.237133\n", " 2.179693e+06\n", - " 2032.0\n", - " 0.269932\n", " 6.129277\n", + " 0.269932\n", + " 2032.0\n", " 33.250000\n", - " 0.237133\n", - " -1.439133\n", " \n", " \n", " 2013-05-31 00:00:00+03:00\n", " gps_u00\n", + " 2.114892\n", + " 8.288687\n", " 6.986551e+06\n", - " 1903.0\n", - " 0.351280\n", " 7.590639\n", + " 0.351280\n", + " 1903.0\n", " 34.000000\n", - " 8.288687\n", - " 2.114892\n", " \n", " \n", " 2013-06-30 00:00:00+03:00\n", " gps_u00\n", + " -4.200287\n", + " 0.014991\n", " 2.252893e+05\n", - " 24.0\n", - " 0.044126\n", " 0.021490\n", + " 0.044126\n", + " 24.0\n", " 0.559017\n", - " 0.014991\n", - " -4.200287\n", " \n", " \n", " 2013-03-31 00:00:00+02:00\n", " gps_u01\n", + " -12.520989\n", + " 0.000004\n", " 1.328713e+04\n", - " 325.0\n", - " 0.056290\n", " 0.073370\n", + " 0.056290\n", + " 325.0\n", " 2.692582\n", - " 0.000004\n", - " -12.520989\n", " \n", " \n", " 2013-04-30 00:00:00+03:00\n", " gps_u01\n", + " -10.510017\n", + " 0.000027\n", " 1.238429e+05\n", - " 2070.0\n", - " 0.066961\n", " 0.629393\n", + " 0.066961\n", + " 2070.0\n", " 32.750000\n", - " 0.000027\n", - " -10.510017\n", " \n", " \n", " 2013-05-31 00:00:00+03:00\n", " gps_u01\n", + " -11.364454\n", + " 0.000012\n", " 1.228235e+05\n", - " 3113.0\n", - " 0.026392\n", " 0.261978\n", + " 0.026392\n", + " 3113.0\n", " 20.250000\n", - " 0.000012\n", - " -11.364454\n", " \n", " \n", "\n", "" ], "text/plain": [ - " user dist_total n_bins speed_average \\\n", - "2013-03-31 00:00:00+02:00 gps_u00 4.132581e+05 288.0 0.033496 \n", - "2013-04-30 00:00:00+03:00 gps_u00 2.179693e+06 2032.0 0.269932 \n", - "2013-05-31 00:00:00+03:00 gps_u00 6.986551e+06 1903.0 0.351280 \n", - "2013-06-30 00:00:00+03:00 gps_u00 2.252893e+05 24.0 0.044126 \n", - "2013-03-31 00:00:00+02:00 gps_u01 1.328713e+04 325.0 0.056290 \n", - "2013-04-30 00:00:00+03:00 gps_u01 1.238429e+05 2070.0 0.066961 \n", - "2013-05-31 00:00:00+03:00 gps_u01 1.228235e+05 3113.0 0.026392 \n", + " user log_variance variance dist_total \\\n", + "2013-03-31 00:00:00+02:00 gps_u00 -5.761688 0.003146 4.132581e+05 \n", + "2013-04-30 00:00:00+03:00 gps_u00 -1.439133 0.237133 2.179693e+06 \n", + "2013-05-31 00:00:00+03:00 gps_u00 2.114892 8.288687 6.986551e+06 \n", + "2013-06-30 00:00:00+03:00 gps_u00 -4.200287 0.014991 2.252893e+05 \n", + "2013-03-31 00:00:00+02:00 gps_u01 -12.520989 0.000004 1.328713e+04 \n", + "2013-04-30 00:00:00+03:00 gps_u01 -10.510017 0.000027 1.238429e+05 \n", + "2013-05-31 00:00:00+03:00 gps_u01 -11.364454 0.000012 1.228235e+05 \n", "\n", - " speed_variance speed_max variance log_variance \n", - "2013-03-31 00:00:00+02:00 0.044885 1.750000 0.003146 -5.761688 \n", - "2013-04-30 00:00:00+03:00 6.129277 33.250000 0.237133 -1.439133 \n", - "2013-05-31 00:00:00+03:00 7.590639 34.000000 8.288687 2.114892 \n", - "2013-06-30 00:00:00+03:00 0.021490 0.559017 0.014991 -4.200287 \n", - "2013-03-31 00:00:00+02:00 0.073370 2.692582 0.000004 -12.520989 \n", - "2013-04-30 00:00:00+03:00 0.629393 32.750000 0.000027 -10.510017 \n", - "2013-05-31 00:00:00+03:00 0.261978 20.250000 0.000012 -11.364454 " + " speed_variance speed_average n_bins speed_max \n", + "2013-03-31 00:00:00+02:00 0.044885 0.033496 288.0 1.750000 \n", + "2013-04-30 00:00:00+03:00 6.129277 0.269932 2032.0 33.250000 \n", + "2013-05-31 00:00:00+03:00 7.590639 0.351280 1903.0 34.000000 \n", + "2013-06-30 00:00:00+03:00 0.021490 0.044126 24.0 0.559017 \n", + "2013-03-31 00:00:00+02:00 0.073370 0.056290 325.0 2.692582 \n", + "2013-04-30 00:00:00+03:00 0.629393 0.066961 2070.0 32.750000 \n", + "2013-05-31 00:00:00+03:00 0.261978 0.026392 3113.0 20.250000 " ] }, "execution_count": 8, @@ -1017,9 +1009,9 @@ { "data": { "text/plain": [ - "{: {'resample_args': {'rule': '1M'}},\n", - " : {'resample_args': {'rule': '1M'}},\n", - " : {'resample_args': {'rule': '1M'}}}" + "{: {},\n", + " : {},\n", + " : {}}" ] }, "execution_count": 9, @@ -1203,7 +1195,7 @@ ], "source": [ "# customized function\n", - "def max_speed(df, feature_arg):\n", + "def max_speed(df):\n", " grouped = df.groupby('user')\n", " df = grouped['double_speed'].max().reset_index('user')\n", " return df\n", @@ -1232,7 +1224,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.6" } }, "nbformat": 4, diff --git a/niimpy/preprocessing/location.py b/niimpy/preprocessing/location.py index 2ec7249f..7101a3f1 100644 --- a/niimpy/preprocessing/location.py +++ b/niimpy/preprocessing/location.py @@ -261,16 +261,15 @@ def number_of_significant_places(lats, lons, times): return np.nanmedian(sps) -def location_number_of_significant_places(df, config=None): +def location_number_of_significant_places( + df, + latitude_column="double_latitude", + longitude_column="double_longitude", + resample_args={"rule": default_freq}, + **kwargs + ): """Computes number of significant places """ assert isinstance(df, pd.DataFrame), "df_u is not a pandas dataframe" - if config is None: - config = {} - assert isinstance(config, dict), "config is not a dictionary" - - latitude_column = config.get("latitude_column", "double_latitude") - longitude_column = config.get("longitude_column", "double_longitude") - config["resample_args"] = config.get("resample_args", {"rule": default_freq}) def compute_features(df): df = df.sort_index() # sort based on time @@ -288,7 +287,7 @@ def compute_features(df): }) return row - result = util.group_data(df).resample(**config["resample_args"], include_groups=False).apply(compute_features) + result = util.group_data(df).resample(**resample_args, include_groups=False).apply(compute_features) result = util.reset_groups(result) result = util.select_columns(result, ["n_significant_places"]) return result @@ -326,7 +325,15 @@ def compute_nbin_maxdist_home(lats, lons, latlon_home, home_radius=50): return time_home, max_dist_home -def location_significant_place_features(df, config=None): +def location_significant_place_features( + df, + latitude_column="double_latitude", + longitude_column="double_latitude", + speed_column="double_speed", + speed_threshold=0.277, + resample_args={"rule": default_freq}, + **kwargs + ): """Calculates features related to Significant Places. Parameters @@ -341,15 +348,6 @@ def location_significant_place_features(df, config=None): resample_args: a dictionary of arguments for the Pandas resample function. For example to resample by hour, you would pass {"rule": "1h"}. """ assert isinstance(df, pd.DataFrame), "df_u is not a pandas dataframe" - if config is None: - config = {} - assert isinstance(config, dict), "config is not a dictionary" - - latitude_column = config.get("latitude_column", "double_latitude") - longitude_column = config.get("longitude_column", "double_latitude") - speed_column = config.get("speed_column", "double_speed") - speed_threshold = config.get("speed_threshold", 0.277) - config["resample_args"] = config.get("resample_args", {"rule": default_freq}) def compute_features(df): """Compute features for a single user""" @@ -421,13 +419,20 @@ def compute_features(df): }) return row - result = util.group_data(df).resample(**config["resample_args"], include_groups=False).apply(compute_features) + result = util.group_data(df).resample(**resample_args, include_groups=False).apply(compute_features) result = util.reset_groups(result) result = util.select_columns(result, ["n_sps", "n_static", "n_moving", "n_rare", "n_home", "max_dist_home", "n_transitions", "n_top1", "n_top2", "n_top3", "n_top4", "n_top5", "entropy", "normalized_entropy"]) return result -def location_distance_features(df, config=None): +def location_distance_features( + df, + latitude_column="double_latitude", + longitude_column="double_latitude", + speed_column="double_speed", + resample_args={"rule": default_freq}, + **kwargs + ): """Calculates features related to distance and speed. Parameters @@ -442,14 +447,6 @@ def location_distance_features(df, config=None): resample_args: a dictionary of arguments for the Pandas resample function. For example to resample by hour, you would pass {"rule": "1h"}. """ assert isinstance(df, pd.DataFrame), "df_u is not a pandas dataframe" - if config is None: - config = {} - assert isinstance(config, dict), "config is not a dictionary" - - latitude_column = config.get("latitude_column", "double_latitude") - longitude_column = config.get("longitude_column", "double_latitude") - speed_column = config.get("speed_column", "double_speed") - config["resample_args"] = config.get("resample_args", {"rule": default_freq}) def compute_features(df): """Compute features for a single user and given time interval""" @@ -488,7 +485,7 @@ def compute_features(df): }) return row - result = util.group_data(df).resample(**config["resample_args"], include_groups=False).apply(compute_features) + result = util.group_data(df).resample(**resample_args, include_groups=False).apply(compute_features) result = util.reset_groups(result) result = util.select_columns(result, ["dist_total", "n_bins", "speed_average", "speed_variance", "speed_max", "variance", "log_variance"]) return result @@ -533,7 +530,7 @@ def extract_features_location(df, features=None): computed_features = [] for features, feature_arg in features.items(): - computed_feature = features(df, feature_arg) + computed_feature = features(df, **feature_arg) computed_feature = util.set_conserved_index(computed_feature) computed_features.append(computed_feature)