diff --git a/README.md b/README.md index d859d5da..ee8dd328 100644 --- a/README.md +++ b/README.md @@ -89,8 +89,8 @@ location = location.reset_index(0).dropna() # Feature extraction features = nilo.extract_features( - lats=location['double_latitude'], - lons=location['double_longitude'], + lats=location['latitude'], + lons=location['longitude'], users=location['user'], groups=location['group'], times=location.index, diff --git a/docs/user_guide/preprocessing/location.ipynb b/docs/user_guide/preprocessing/location.ipynb index 46ff53ac..8339c898 100644 --- a/docs/user_guide/preprocessing/location.ipynb +++ b/docs/user_guide/preprocessing/location.ipynb @@ -22,8 +22,8 @@ "Location data is expected to have the following columns (column names can be different, but in that case they must be provided as parameters):\n", "- `user`: Subject ID\n", "- `device`: Device ID\n", - "- `double_latitude`: Latitude as a floating point number\n", - "- `double_longitude`: Longitude as a floating point number\n", + "- `latitude`: Latitude as a floating point number\n", + "- `longitude`: Longitude as a floating point number\n", "\n", "Optional columns include:\n", "- `double_speed`: Speed measured at the location\n", @@ -208,7 +208,120 @@ "id": "86e7396c", "metadata": {}, "source": [ - "The necessary columns for further analysis are `double_latitude`, `double_longitude`, `double_speed`, and `user`. `user` refers to a unique identifier for a subject." + "For further analysis we need a `latitude`, `longitude`, `speed`, and `user` column. `user` refers to a unique identifier for a subject.\n", + "\n", + "These columns exist in the data, but some column names are different. We could provide these column names as arguments, but it is easier to rename them here:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "df3a2a0a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timelatitudelongitudespeeduserdatetime
2013-03-27 06:03:29+02:00136435700943.706667-72.2890970.00gps_u012013-03-27 06:03:29+02:00
2013-03-27 06:23:29+02:00136435820943.706637-72.2890660.00gps_u012013-03-27 06:23:29+02:00
2013-03-27 06:43:25+02:00136435940543.706678-72.2890180.25gps_u012013-03-27 06:43:25+02:00
2013-03-27 07:03:29+02:00136436060943.706665-72.2890870.00gps_u012013-03-27 07:03:29+02:00
2013-03-27 07:23:25+02:00136436180543.706808-72.2893700.00gps_u012013-03-27 07:23:25+02:00
\n", + "
" + ], + "text/plain": [ + " time latitude longitude speed user \\\n", + "2013-03-27 06:03:29+02:00 1364357009 43.706667 -72.289097 0.00 gps_u01 \n", + "2013-03-27 06:23:29+02:00 1364358209 43.706637 -72.289066 0.00 gps_u01 \n", + "2013-03-27 06:43:25+02:00 1364359405 43.706678 -72.289018 0.25 gps_u01 \n", + "2013-03-27 07:03:29+02:00 1364360609 43.706665 -72.289087 0.00 gps_u01 \n", + "2013-03-27 07:23:25+02:00 1364361805 43.706808 -72.289370 0.00 gps_u01 \n", + "\n", + " datetime \n", + "2013-03-27 06:03:29+02:00 2013-03-27 06:03:29+02:00 \n", + "2013-03-27 06:23:29+02:00 2013-03-27 06:23:29+02:00 \n", + "2013-03-27 06:43:25+02:00 2013-03-27 06:43:25+02:00 \n", + "2013-03-27 07:03:29+02:00 2013-03-27 07:03:29+02:00 \n", + "2013-03-27 07:23:25+02:00 2013-03-27 07:23:25+02:00 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = data.rename(columns={\"double_latitude\": \"latitude\", \"double_longitude\": \"longitude\", \"double_speed\": \"speed\"})\n", + "data.head()" ] }, { @@ -235,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "a96bdaa6", "metadata": {}, "outputs": [ @@ -245,7 +358,7 @@ "(9857, 6)" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -290,7 +403,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "01aefd90", "metadata": {}, "outputs": [ @@ -300,20 +413,20 @@ "(9755, 5)" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "binned_data = niimpy.util.aggregate(data, freq='5min', method_numerical='median')\n", - "binned_data = binned_data.reset_index(0).dropna()\n", + "binned_data = binned_data.dropna()\n", "binned_data.shape" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "d7027bec", "metadata": {}, "outputs": [ @@ -340,9 +453,9 @@ " \n", " user\n", " 
time\n", - " double_latitude\n", - " double_longitude\n", - " double_speed\n", + " latitude\n", + " longitude\n", + " speed\n", " \n", " \n", " \n", @@ -386,96 +499,26 @@ " -72.329240\n", " 0.0\n", " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 2013-05-29 16:10:00+03:00\n", - " gps_u01\n", - " 1.369833e+09\n", - " 43.706711\n", - " -72.289205\n", - " 0.0\n", - " \n", - " \n", - " 2013-05-29 16:20:00+03:00\n", - " gps_u01\n", - " 1.369834e+09\n", - " 43.706708\n", - " -72.289162\n", - " 0.0\n", - " \n", - " \n", - " 2013-05-29 16:30:00+03:00\n", - " gps_u01\n", - " 1.369834e+09\n", - " 43.706725\n", - " -72.289149\n", - " 0.0\n", - " \n", - " \n", - " 2013-05-29 16:40:00+03:00\n", - " gps_u01\n", - " 1.369835e+09\n", - " 43.706697\n", - " -72.289165\n", - " 0.0\n", - " \n", - " \n", - " 2013-05-29 16:50:00+03:00\n", - " gps_u01\n", - " 1.369836e+09\n", - " 43.706713\n", - " -72.289191\n", - " 0.0\n", - " \n", " \n", "\n", - "

9755 rows × 5 columns

\n", "" ], "text/plain": [ - " user time double_latitude \\\n", - "2013-03-27 06:00:00+02:00 gps_u00 1.364357e+09 43.759135 \n", - "2013-03-27 06:20:00+02:00 gps_u00 1.364358e+09 43.759503 \n", - "2013-03-27 06:40:00+02:00 gps_u00 1.364359e+09 43.759134 \n", - "2013-03-27 07:00:00+02:00 gps_u00 1.364361e+09 43.759135 \n", - "2013-03-27 07:20:00+02:00 gps_u00 1.364362e+09 43.759135 \n", - "... ... ... ... \n", - "2013-05-29 16:10:00+03:00 gps_u01 1.369833e+09 43.706711 \n", - "2013-05-29 16:20:00+03:00 gps_u01 1.369834e+09 43.706708 \n", - "2013-05-29 16:30:00+03:00 gps_u01 1.369834e+09 43.706725 \n", - "2013-05-29 16:40:00+03:00 gps_u01 1.369835e+09 43.706697 \n", - "2013-05-29 16:50:00+03:00 gps_u01 1.369836e+09 43.706713 \n", - "\n", - " double_longitude double_speed \n", - "2013-03-27 06:00:00+02:00 -72.329240 0.0 \n", - "2013-03-27 06:20:00+02:00 -72.329018 0.0 \n", - "2013-03-27 06:40:00+02:00 -72.329238 0.0 \n", - "2013-03-27 07:00:00+02:00 -72.329240 0.0 \n", - "2013-03-27 07:20:00+02:00 -72.329240 0.0 \n", - "... ... ... 
\n", - "2013-05-29 16:10:00+03:00 -72.289205 0.0 \n", - "2013-05-29 16:20:00+03:00 -72.289162 0.0 \n", - "2013-05-29 16:30:00+03:00 -72.289149 0.0 \n", - "2013-05-29 16:40:00+03:00 -72.289165 0.0 \n", - "2013-05-29 16:50:00+03:00 -72.289191 0.0 \n", - "\n", - "[9755 rows x 5 columns]" + " user time latitude longitude speed\n", + "2013-03-27 06:00:00+02:00 gps_u00 1.364357e+09 43.759135 -72.329240 0.0\n", + "2013-03-27 06:20:00+02:00 gps_u00 1.364358e+09 43.759503 -72.329018 0.0\n", + "2013-03-27 06:40:00+02:00 gps_u00 1.364359e+09 43.759134 -72.329238 0.0\n", + "2013-03-27 07:00:00+02:00 gps_u00 1.364361e+09 43.759135 -72.329240 0.0\n", + "2013-03-27 07:20:00+02:00 gps_u00 1.364362e+09 43.759135 -72.329240 0.0" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "binned_data" + "binned_data.head()" ] }, { @@ -532,7 +575,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "5bf0185c", "metadata": {}, "outputs": [ @@ -559,25 +602,25 @@ " \n", " user\n", " n_significant_places\n", + " n_home\n", " n_sps\n", - " n_static\n", + " normalized_entropy\n", " n_moving\n", - " n_rare\n", - " n_home\n", + " n_top4\n", " max_dist_home\n", - " n_transitions\n", - " n_top1\n", - " ...\n", " n_top5\n", + " n_rare\n", + " ...\n", + " n_top1\n", " entropy\n", - " normalized_entropy\n", + " n_transitions\n", + " variance\n", + " log_variance\n", " dist_total\n", - " n_bins\n", " speed_average\n", - " speed_variance\n", " speed_max\n", - " variance\n", - " log_variance\n", + " n_bins\n", + " speed_variance\n", " \n", " \n", " \n", @@ -585,169 +628,169 @@ " 2013-03-31 00:00:00+02:00\n", " gps_u00\n", " 6\n", + " 97.0\n", " 5.0\n", - " 280.0\n", - " 8.0\n", - " 3.0\n", - " 106.0\n", + " 3.006920\n", + " 54.0\n", + " 18.0\n", " 2.074186e+04\n", - " 48.0\n", - " 106.0\n", + " 8.0\n", + " 0.0\n", " ...\n", - " 18.0\n", - " 5.091668\n", - " 3.163631\n", - " 4.132581e+05\n", - " 288.0\n", - " 
0.033496\n", - " 0.044885\n", - " 1.750000\n", + " 98.0\n", + " 4.839451\n", + " 25.0\n", " 0.003146\n", " -5.761688\n", + " 4.132581e+05\n", + " 1.116127\n", + " 17.284037\n", + " 288.0\n", + " 9.876405\n", " \n", " \n", " 2013-04-30 00:00:00+03:00\n", " gps_u00\n", " 10\n", - " 10.0\n", - " 1966.0\n", - " 66.0\n", - " 45.0\n", - " 1010.0\n", + " 969.0\n", + " 8.0\n", + " 3.045317\n", + " 318.0\n", + " 37.0\n", " 2.914790e+05\n", - " 194.0\n", - " 1016.0\n", + " 18.0\n", + " 18.0\n", " ...\n", - " 38.0\n", - " 7.284903\n", - " 3.163793\n", - " 2.179693e+06\n", - " 2032.0\n", - " 0.269932\n", - " 6.129277\n", - " 33.250000\n", + " 976.0\n", + " 6.332559\n", + " 97.0\n", " 0.237133\n", " -1.439133\n", + " 2.179693e+06\n", + " 0.821680\n", + " 33.831053\n", + " 2032.0\n", + " 9.970465\n", " \n", " \n", " 2013-05-31 00:00:00+03:00\n", " gps_u00\n", " 15\n", - " 12.0\n", - " 1827.0\n", - " 76.0\n", - " 86.0\n", - " 1028.0\n", + " 1007.0\n", + " 9.0\n", + " 3.110317\n", + " 255.0\n", + " 43.0\n", " 1.041741e+06\n", - " 107.0\n", - " 1030.0\n", + " 38.0\n", + " 31.0\n", " ...\n", - " 46.0\n", - " 6.701177\n", - " 2.696752\n", - " 6.986551e+06\n", - " 1903.0\n", - " 0.351280\n", - " 7.590639\n", - " 34.000000\n", + " 1009.0\n", + " 6.834065\n", + " 69.0\n", " 8.288687\n", " 2.114892\n", + " 6.986551e+06\n", + " 0.847341\n", + " 42.507751\n", + " 1903.0\n", + " 15.081070\n", " \n", " \n", " 2013-06-30 00:00:00+03:00\n", " gps_u00\n", " 1\n", - " 1.0\n", - " 22.0\n", - " 2.0\n", - " 15.0\n", " 0.0\n", - " 2.035837e+04\n", - " 10.0\n", - " 15.0\n", - " ...\n", " 0.0\n", " 0.000000\n", + " 18.0\n", + " 0.0\n", + " 1.989381e+04\n", + " 0.0\n", + " 6.0\n", + " ...\n", + " 6.0\n", " 0.000000\n", - " 2.252893e+05\n", - " 24.0\n", - " 0.044126\n", - " 0.021490\n", - " 0.559017\n", + " 0.0\n", " 0.014991\n", " -4.200287\n", + " 2.252893e+05\n", + " 14.601880\n", + " 43.321397\n", + " 24.0\n", + " 242.791725\n", " \n", " \n", " 2013-03-31 00:00:00+02:00\n", " gps_u01\n", " 4\n", + " 
273.0\n", " 2.0\n", - " 307.0\n", - " 18.0\n", + " 2.584963\n", + " 12.0\n", " 0.0\n", - " 260.0\n", " 6.975303e+02\n", - " 8.0\n", - " 286.0\n", - " ...\n", " 0.0\n", - " 3.044522\n", - " 4.392317\n", - " 1.328713e+04\n", - " 325.0\n", - " 0.056290\n", - " 0.073370\n", - " 2.692582\n", + " 0.0\n", + " ...\n", + " 307.0\n", + " 1.791759\n", + " 4.0\n", " 0.000004\n", " -12.520989\n", + " 1.328713e+04\n", + " 0.029994\n", + " 0.744393\n", + " 325.0\n", + " 0.008880\n", " \n", " \n", " 2013-04-30 00:00:00+03:00\n", " gps_u01\n", " 4\n", - " 1.0\n", - " 1999.0\n", - " 71.0\n", - " 1.0\n", - " 1500.0\n", - " 1.156568e+04\n", + " 1492.0\n", " 2.0\n", - " 1998.0\n", - " ...\n", + " 5.977280\n", + " 78.0\n", " 0.0\n", - " 0.000000\n", - " 0.000000\n", - " 1.238429e+05\n", - " 2070.0\n", - " 0.066961\n", - " 0.629393\n", - " 32.750000\n", + " 1.156568e+04\n", + " 0.0\n", + " 1.0\n", + " ...\n", + " 1928.0\n", + " 4.143135\n", + " 26.0\n", " 0.000027\n", " -10.510017\n", + " 1.238429e+05\n", + " 0.050416\n", + " 16.992157\n", + " 2070.0\n", + " 0.237219\n", " \n", " \n", " 2013-05-31 00:00:00+03:00\n", " gps_u01\n", " 2\n", + " 42.0\n", " 1.0\n", - " 3079.0\n", - " 34.0\n", - " 1.0\n", - " 45.0\n", - " 3.957650e+03\n", - " 2.0\n", - " 3078.0\n", - " ...\n", - " 0.0\n", " 0.000000\n", + " 110.0\n", + " 0.0\n", + " 6.771047e+02\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 3003.0\n", " 0.000000\n", - " 1.228235e+05\n", - " 3113.0\n", - " 0.026392\n", - " 0.261978\n", - " 20.250000\n", + " 0.0\n", " 0.000012\n", " -11.364454\n", + " 1.228235e+05\n", + " 0.044657\n", + " 9.967899\n", + " 3113.0\n", + " 0.085366\n", " \n", " \n", "\n", @@ -755,64 +798,64 @@ "" ], "text/plain": [ - " user n_significant_places n_sps n_static \\\n", - "2013-03-31 00:00:00+02:00 gps_u00 6 5.0 280.0 \n", - "2013-04-30 00:00:00+03:00 gps_u00 10 10.0 1966.0 \n", - "2013-05-31 00:00:00+03:00 gps_u00 15 12.0 1827.0 \n", - "2013-06-30 00:00:00+03:00 gps_u00 1 1.0 22.0 \n", - "2013-03-31 00:00:00+02:00 
gps_u01 4 2.0 307.0 \n", - "2013-04-30 00:00:00+03:00 gps_u01 4 1.0 1999.0 \n", - "2013-05-31 00:00:00+03:00 gps_u01 2 1.0 3079.0 \n", + " user n_significant_places n_home n_sps \\\n", + "2013-03-31 00:00:00+02:00 gps_u00 6 97.0 5.0 \n", + "2013-04-30 00:00:00+03:00 gps_u00 10 969.0 8.0 \n", + "2013-05-31 00:00:00+03:00 gps_u00 15 1007.0 9.0 \n", + "2013-06-30 00:00:00+03:00 gps_u00 1 0.0 0.0 \n", + "2013-03-31 00:00:00+02:00 gps_u01 4 273.0 2.0 \n", + "2013-04-30 00:00:00+03:00 gps_u01 4 1492.0 2.0 \n", + "2013-05-31 00:00:00+03:00 gps_u01 2 42.0 1.0 \n", "\n", - " n_moving n_rare n_home max_dist_home \\\n", - "2013-03-31 00:00:00+02:00 8.0 3.0 106.0 2.074186e+04 \n", - "2013-04-30 00:00:00+03:00 66.0 45.0 1010.0 2.914790e+05 \n", - "2013-05-31 00:00:00+03:00 76.0 86.0 1028.0 1.041741e+06 \n", - "2013-06-30 00:00:00+03:00 2.0 15.0 0.0 2.035837e+04 \n", - "2013-03-31 00:00:00+02:00 18.0 0.0 260.0 6.975303e+02 \n", - "2013-04-30 00:00:00+03:00 71.0 1.0 1500.0 1.156568e+04 \n", - "2013-05-31 00:00:00+03:00 34.0 1.0 45.0 3.957650e+03 \n", + " normalized_entropy n_moving n_top4 \\\n", + "2013-03-31 00:00:00+02:00 3.006920 54.0 18.0 \n", + "2013-04-30 00:00:00+03:00 3.045317 318.0 37.0 \n", + "2013-05-31 00:00:00+03:00 3.110317 255.0 43.0 \n", + "2013-06-30 00:00:00+03:00 0.000000 18.0 0.0 \n", + "2013-03-31 00:00:00+02:00 2.584963 12.0 0.0 \n", + "2013-04-30 00:00:00+03:00 5.977280 78.0 0.0 \n", + "2013-05-31 00:00:00+03:00 0.000000 110.0 0.0 \n", "\n", - " n_transitions n_top1 ... n_top5 entropy \\\n", - "2013-03-31 00:00:00+02:00 48.0 106.0 ... 18.0 5.091668 \n", - "2013-04-30 00:00:00+03:00 194.0 1016.0 ... 38.0 7.284903 \n", - "2013-05-31 00:00:00+03:00 107.0 1030.0 ... 46.0 6.701177 \n", - "2013-06-30 00:00:00+03:00 10.0 15.0 ... 0.0 0.000000 \n", - "2013-03-31 00:00:00+02:00 8.0 286.0 ... 0.0 3.044522 \n", - "2013-04-30 00:00:00+03:00 2.0 1998.0 ... 0.0 0.000000 \n", - "2013-05-31 00:00:00+03:00 2.0 3078.0 ... 0.0 0.000000 \n", + " max_dist_home n_top5 n_rare ... 
n_top1 \\\n", + "2013-03-31 00:00:00+02:00 2.074186e+04 8.0 0.0 ... 98.0 \n", + "2013-04-30 00:00:00+03:00 2.914790e+05 18.0 18.0 ... 976.0 \n", + "2013-05-31 00:00:00+03:00 1.041741e+06 38.0 31.0 ... 1009.0 \n", + "2013-06-30 00:00:00+03:00 1.989381e+04 0.0 6.0 ... 6.0 \n", + "2013-03-31 00:00:00+02:00 6.975303e+02 0.0 0.0 ... 307.0 \n", + "2013-04-30 00:00:00+03:00 1.156568e+04 0.0 1.0 ... 1928.0 \n", + "2013-05-31 00:00:00+03:00 6.771047e+02 0.0 0.0 ... 3003.0 \n", "\n", - " normalized_entropy dist_total n_bins \\\n", - "2013-03-31 00:00:00+02:00 3.163631 4.132581e+05 288.0 \n", - "2013-04-30 00:00:00+03:00 3.163793 2.179693e+06 2032.0 \n", - "2013-05-31 00:00:00+03:00 2.696752 6.986551e+06 1903.0 \n", - "2013-06-30 00:00:00+03:00 0.000000 2.252893e+05 24.0 \n", - "2013-03-31 00:00:00+02:00 4.392317 1.328713e+04 325.0 \n", - "2013-04-30 00:00:00+03:00 0.000000 1.238429e+05 2070.0 \n", - "2013-05-31 00:00:00+03:00 0.000000 1.228235e+05 3113.0 \n", + " entropy n_transitions variance log_variance \\\n", + "2013-03-31 00:00:00+02:00 4.839451 25.0 0.003146 -5.761688 \n", + "2013-04-30 00:00:00+03:00 6.332559 97.0 0.237133 -1.439133 \n", + "2013-05-31 00:00:00+03:00 6.834065 69.0 8.288687 2.114892 \n", + "2013-06-30 00:00:00+03:00 0.000000 0.0 0.014991 -4.200287 \n", + "2013-03-31 00:00:00+02:00 1.791759 4.0 0.000004 -12.520989 \n", + "2013-04-30 00:00:00+03:00 4.143135 26.0 0.000027 -10.510017 \n", + "2013-05-31 00:00:00+03:00 0.000000 0.0 0.000012 -11.364454 \n", "\n", - " speed_average speed_variance speed_max variance \\\n", - "2013-03-31 00:00:00+02:00 0.033496 0.044885 1.750000 0.003146 \n", - "2013-04-30 00:00:00+03:00 0.269932 6.129277 33.250000 0.237133 \n", - "2013-05-31 00:00:00+03:00 0.351280 7.590639 34.000000 8.288687 \n", - "2013-06-30 00:00:00+03:00 0.044126 0.021490 0.559017 0.014991 \n", - "2013-03-31 00:00:00+02:00 0.056290 0.073370 2.692582 0.000004 \n", - "2013-04-30 00:00:00+03:00 0.066961 0.629393 32.750000 0.000027 \n", - "2013-05-31 
00:00:00+03:00 0.026392 0.261978 20.250000 0.000012 \n", + " dist_total speed_average speed_max n_bins \\\n", + "2013-03-31 00:00:00+02:00 4.132581e+05 1.116127 17.284037 288.0 \n", + "2013-04-30 00:00:00+03:00 2.179693e+06 0.821680 33.831053 2032.0 \n", + "2013-05-31 00:00:00+03:00 6.986551e+06 0.847341 42.507751 1903.0 \n", + "2013-06-30 00:00:00+03:00 2.252893e+05 14.601880 43.321397 24.0 \n", + "2013-03-31 00:00:00+02:00 1.328713e+04 0.029994 0.744393 325.0 \n", + "2013-04-30 00:00:00+03:00 1.238429e+05 0.050416 16.992157 2070.0 \n", + "2013-05-31 00:00:00+03:00 1.228235e+05 0.044657 9.967899 3113.0 \n", "\n", - " log_variance \n", - "2013-03-31 00:00:00+02:00 -5.761688 \n", - "2013-04-30 00:00:00+03:00 -1.439133 \n", - "2013-05-31 00:00:00+03:00 2.114892 \n", - "2013-06-30 00:00:00+03:00 -4.200287 \n", - "2013-03-31 00:00:00+02:00 -12.520989 \n", - "2013-04-30 00:00:00+03:00 -10.510017 \n", - "2013-05-31 00:00:00+03:00 -11.364454 \n", + " speed_variance \n", + "2013-03-31 00:00:00+02:00 9.876405 \n", + "2013-04-30 00:00:00+03:00 9.970465 \n", + "2013-05-31 00:00:00+03:00 15.081070 \n", + "2013-06-30 00:00:00+03:00 242.791725 \n", + "2013-03-31 00:00:00+02:00 0.008880 \n", + "2013-04-30 00:00:00+03:00 0.237219 \n", + "2013-05-31 00:00:00+03:00 0.085366 \n", "\n", "[7 rows x 23 columns]" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -828,7 +871,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "d2b2e06b", "metadata": {}, "outputs": [ @@ -854,118 +897,118 @@ " \n", " \n", " user\n", + " variance\n", + " log_variance\n", " dist_total\n", - " n_bins\n", " speed_average\n", - " speed_variance\n", " speed_max\n", - " variance\n", - " log_variance\n", + " n_bins\n", + " speed_variance\n", " \n", " \n", " \n", " \n", " 2013-03-31 00:00:00+02:00\n", " gps_u00\n", - " 4.132581e+05\n", - " 288.0\n", - " 0.033496\n", - " 0.044885\n", - " 1.750000\n", " 0.003146\n", " 
-5.761688\n", + " 4.132581e+05\n", + " 1.116127\n", + " 17.284037\n", + " 288.0\n", + " 9.876405\n", " \n", " \n", " 2013-04-30 00:00:00+03:00\n", " gps_u00\n", - " 2.179693e+06\n", - " 2032.0\n", - " 0.269932\n", - " 6.129277\n", - " 33.250000\n", " 0.237133\n", " -1.439133\n", + " 2.179693e+06\n", + " 0.821680\n", + " 33.831053\n", + " 2032.0\n", + " 9.970465\n", " \n", " \n", " 2013-05-31 00:00:00+03:00\n", " gps_u00\n", - " 6.986551e+06\n", - " 1903.0\n", - " 0.351280\n", - " 7.590639\n", - " 34.000000\n", " 8.288687\n", " 2.114892\n", + " 6.986551e+06\n", + " 0.847341\n", + " 42.507751\n", + " 1903.0\n", + " 15.081070\n", " \n", " \n", " 2013-06-30 00:00:00+03:00\n", " gps_u00\n", - " 2.252893e+05\n", - " 24.0\n", - " 0.044126\n", - " 0.021490\n", - " 0.559017\n", " 0.014991\n", " -4.200287\n", + " 2.252893e+05\n", + " 14.601880\n", + " 43.321397\n", + " 24.0\n", + " 242.791725\n", " \n", " \n", " 2013-03-31 00:00:00+02:00\n", " gps_u01\n", - " 1.328713e+04\n", - " 325.0\n", - " 0.056290\n", - " 0.073370\n", - " 2.692582\n", " 0.000004\n", " -12.520989\n", + " 1.328713e+04\n", + " 0.029994\n", + " 0.744393\n", + " 325.0\n", + " 0.008880\n", " \n", " \n", " 2013-04-30 00:00:00+03:00\n", " gps_u01\n", - " 1.238429e+05\n", - " 2070.0\n", - " 0.066961\n", - " 0.629393\n", - " 32.750000\n", " 0.000027\n", " -10.510017\n", + " 1.238429e+05\n", + " 0.050416\n", + " 16.992157\n", + " 2070.0\n", + " 0.237219\n", " \n", " \n", " 2013-05-31 00:00:00+03:00\n", " gps_u01\n", - " 1.228235e+05\n", - " 3113.0\n", - " 0.026392\n", - " 0.261978\n", - " 20.250000\n", " 0.000012\n", " -11.364454\n", + " 1.228235e+05\n", + " 0.044657\n", + " 9.967899\n", + " 3113.0\n", + " 0.085366\n", " \n", " \n", "\n", "" ], "text/plain": [ - " user dist_total n_bins speed_average \\\n", - "2013-03-31 00:00:00+02:00 gps_u00 4.132581e+05 288.0 0.033496 \n", - "2013-04-30 00:00:00+03:00 gps_u00 2.179693e+06 2032.0 0.269932 \n", - "2013-05-31 00:00:00+03:00 gps_u00 6.986551e+06 1903.0 0.351280 
\n", - "2013-06-30 00:00:00+03:00 gps_u00 2.252893e+05 24.0 0.044126 \n", - "2013-03-31 00:00:00+02:00 gps_u01 1.328713e+04 325.0 0.056290 \n", - "2013-04-30 00:00:00+03:00 gps_u01 1.238429e+05 2070.0 0.066961 \n", - "2013-05-31 00:00:00+03:00 gps_u01 1.228235e+05 3113.0 0.026392 \n", + " user variance log_variance dist_total \\\n", + "2013-03-31 00:00:00+02:00 gps_u00 0.003146 -5.761688 4.132581e+05 \n", + "2013-04-30 00:00:00+03:00 gps_u00 0.237133 -1.439133 2.179693e+06 \n", + "2013-05-31 00:00:00+03:00 gps_u00 8.288687 2.114892 6.986551e+06 \n", + "2013-06-30 00:00:00+03:00 gps_u00 0.014991 -4.200287 2.252893e+05 \n", + "2013-03-31 00:00:00+02:00 gps_u01 0.000004 -12.520989 1.328713e+04 \n", + "2013-04-30 00:00:00+03:00 gps_u01 0.000027 -10.510017 1.238429e+05 \n", + "2013-05-31 00:00:00+03:00 gps_u01 0.000012 -11.364454 1.228235e+05 \n", "\n", - " speed_variance speed_max variance log_variance \n", - "2013-03-31 00:00:00+02:00 0.044885 1.750000 0.003146 -5.761688 \n", - "2013-04-30 00:00:00+03:00 6.129277 33.250000 0.237133 -1.439133 \n", - "2013-05-31 00:00:00+03:00 7.590639 34.000000 8.288687 2.114892 \n", - "2013-06-30 00:00:00+03:00 0.021490 0.559017 0.014991 -4.200287 \n", - "2013-03-31 00:00:00+02:00 0.073370 2.692582 0.000004 -12.520989 \n", - "2013-04-30 00:00:00+03:00 0.629393 32.750000 0.000027 -10.510017 \n", - "2013-05-31 00:00:00+03:00 0.261978 20.250000 0.000012 -11.364454 " + " speed_average speed_max n_bins speed_variance \n", + "2013-03-31 00:00:00+02:00 1.116127 17.284037 288.0 9.876405 \n", + "2013-04-30 00:00:00+03:00 0.821680 33.831053 2032.0 9.970465 \n", + "2013-05-31 00:00:00+03:00 0.847341 42.507751 1903.0 15.081070 \n", + "2013-06-30 00:00:00+03:00 14.601880 43.321397 24.0 242.791725 \n", + "2013-03-31 00:00:00+02:00 0.029994 0.744393 325.0 0.008880 \n", + "2013-04-30 00:00:00+03:00 0.050416 16.992157 2070.0 0.237219 \n", + "2013-05-31 00:00:00+03:00 0.044657 9.967899 3113.0 0.085366 " ] }, - "execution_count": 8, + "execution_count": 
9, "metadata": {}, "output_type": "execute_result" } @@ -1010,19 +1053,19 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "e602497e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{: {'resample_args': {'rule': '1M'}},\n", - " : {'resample_args': {'rule': '1M'}},\n", - " : {'resample_args': {'rule': '1M'}}}" + "{: {'resample_args': {'rule': '1ME'}},\n", + " : {'resample_args': {'rule': '1ME'}},\n", + " : {'resample_args': {'rule': '1ME'}}}" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1033,7 +1076,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "6a042466", "metadata": {}, "outputs": [ @@ -1060,9 +1103,9 @@ " \n", " user\n", " time\n", - " double_latitude\n", - " double_longitude\n", - " double_speed\n", + " latitude\n", + " longitude\n", + " speed\n", " \n", " \n", " \n", @@ -1111,22 +1154,15 @@ "" ], "text/plain": [ - " user time double_latitude \\\n", - "2013-03-27 06:00:00+02:00 gps_u00 1.364357e+09 43.759135 \n", - "2013-03-27 06:20:00+02:00 gps_u00 1.364358e+09 43.759503 \n", - "2013-03-27 06:40:00+02:00 gps_u00 1.364359e+09 43.759134 \n", - "2013-03-27 07:00:00+02:00 gps_u00 1.364361e+09 43.759135 \n", - "2013-03-27 07:20:00+02:00 gps_u00 1.364362e+09 43.759135 \n", - "\n", - " double_longitude double_speed \n", - "2013-03-27 06:00:00+02:00 -72.329240 0.0 \n", - "2013-03-27 06:20:00+02:00 -72.329018 0.0 \n", - "2013-03-27 06:40:00+02:00 -72.329238 0.0 \n", - "2013-03-27 07:00:00+02:00 -72.329240 0.0 \n", - "2013-03-27 07:20:00+02:00 -72.329240 0.0 " + " user time latitude longitude speed\n", + "2013-03-27 06:00:00+02:00 gps_u00 1.364357e+09 43.759135 -72.329240 0.0\n", + "2013-03-27 06:20:00+02:00 gps_u00 1.364358e+09 43.759503 -72.329018 0.0\n", + "2013-03-27 06:40:00+02:00 gps_u00 1.364359e+09 43.759134 -72.329238 0.0\n", + "2013-03-27 07:00:00+02:00 gps_u00 1.364361e+09 43.759135 -72.329240 0.0\n", + 
"2013-03-27 07:20:00+02:00 gps_u00 1.364362e+09 43.759135 -72.329240 0.0" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1146,7 +1182,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "81d9d0d1", "metadata": {}, "outputs": [ @@ -1172,7 +1208,7 @@ " \n", " \n", " user\n", - " double_speed\n", + " speed\n", " \n", " \n", " \n", @@ -1191,12 +1227,12 @@ "" ], "text/plain": [ - " user double_speed\n", - "0 gps_u00 34.00\n", - "1 gps_u01 32.75" + " user speed\n", + "0 gps_u00 34.00\n", + "1 gps_u01 32.75" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1205,7 +1241,7 @@ "# customized function\n", "def max_speed(df, feature_arg):\n", " grouped = df.groupby('user')\n", - " df = grouped['double_speed'].max().reset_index('user')\n", + " df = grouped['speed'].max().reset_index('user')\n", " return df\n", "\n", "customized_features = nilo.extract_features_location(\n", diff --git a/niimpy/preprocessing/location.py b/niimpy/preprocessing/location.py index 2ec7249f..8439374b 100644 --- a/niimpy/preprocessing/location.py +++ b/niimpy/preprocessing/location.py @@ -65,8 +65,8 @@ def filter_location(location, remove_disabled=True, remove_zeros=True, remove_network=False, - latitude_column = "double_latitude", - longitude_column = "double_longitude", + latitude_column = "latitude", + longitude_column = "longitude", label_column = "label", provider_column = "provider", ): @@ -268,8 +268,8 @@ def location_number_of_significant_places(df, config=None): config = {} assert isinstance(config, dict), "config is not a dictionary" - latitude_column = config.get("latitude_column", "double_latitude") - longitude_column = config.get("longitude_column", "double_longitude") + latitude_column = config.get("latitude_column", "latitude") + longitude_column = config.get("longitude_column", "longitude") config["resample_args"] = 
config.get("resample_args", {"rule": default_freq}) def compute_features(df): @@ -335,8 +335,8 @@ def location_significant_place_features(df, config=None): config: A dictionary of optional arguments Optional arguments in config: - longitude_column: The name of the column with longitude data in a floating point format. Defaults to 'double_longitude'. - latitude_column: The name of the column with latitude data in a floating point format. Defaults to 'double_latitude'. + longitude_column: The name of the column with longitude data in a floating point format. Defaults to 'longitude'. + latitude_column: The name of the column with latitude data in a floating point format. Defaults to 'latitude'. speed_column: The name of the column with speed data in a floating point format. Defaults to 'double_speed'. resample_args: a dictionary of arguments for the Pandas resample function. For example to resample by hour, you would pass {"rule": "1h"}. """ @@ -345,8 +345,8 @@ def location_significant_place_features(df, config=None): config = {} assert isinstance(config, dict), "config is not a dictionary" - latitude_column = config.get("latitude_column", "double_latitude") - longitude_column = config.get("longitude_column", "double_latitude") + latitude_column = config.get("latitude_column", "latitude") + longitude_column = config.get("longitude_column", "latitude") speed_column = config.get("speed_column", "double_speed") speed_threshold = config.get("speed_threshold", 0.277) config["resample_args"] = config.get("resample_args", {"rule": default_freq}) @@ -436,8 +436,8 @@ def location_distance_features(df, config=None): config: A dictionary of optional arguments Optional arguments in config: - longitude_column: The name of the column with longitude data in a floating point format. Defaults to 'double_longitude'. - latitude_column: The name of the column with latitude data in a floating point format. Defaults to 'double_latitude'. 
+ longitude_column: The name of the column with longitude data in a floating point format. Defaults to 'longitude'. + latitude_column: The name of the column with latitude data in a floating point format. Defaults to 'latitude'. speed_column: The name of the column with speed data in a floating point format. Defaults to 'double_speed'. resample_args: a dictionary of arguments for the Pandas resample function. For example to resample by hour, you would pass {"rule": "1h"}. """ @@ -446,8 +446,8 @@ def location_distance_features(df, config=None): config = {} assert isinstance(config, dict), "config is not a dictionary" - latitude_column = config.get("latitude_column", "double_latitude") - longitude_column = config.get("longitude_column", "double_latitude") + latitude_column = config.get("latitude_column", "latitude") + longitude_column = config.get("longitude_column", "latitude") speed_column = config.get("speed_column", "double_speed") config["resample_args"] = config.get("resample_args", {"rule": default_freq}) @@ -505,7 +505,7 @@ def extract_features_location(df, features=None): ---------- df : pd.DataFrame dataframe of location data. It must contain these columns: - `double_latitude`, `double_longitude`, `user`, `group`. + `latitude`, `longitude`, `user`, `group`. `double_speed` is optional. If not provided, it will be computed manually. speed_threshold : float