diff --git a/README.md b/README.md
index d859d5da..ee8dd328 100644
--- a/README.md
+++ b/README.md
@@ -89,8 +89,8 @@ location = location.reset_index(0).dropna()
# Feature extraction
features = nilo.extract_features(
- lats=location['double_latitude'],
- lons=location['double_longitude'],
+ lats=location['latitude'],
+ lons=location['longitude'],
users=location['user'],
groups=location['group'],
times=location.index,
diff --git a/docs/user_guide/preprocessing/location.ipynb b/docs/user_guide/preprocessing/location.ipynb
index 46ff53ac..8339c898 100644
--- a/docs/user_guide/preprocessing/location.ipynb
+++ b/docs/user_guide/preprocessing/location.ipynb
@@ -22,8 +22,8 @@
"Location data is expected to have the following columns (column names can be different, but in that case they must be provided as parameters):\n",
"- `user`: Subject ID\n",
"- `device`: Device ID\n",
- "- `double_latitude`: Latitude as a floating point number\n",
- "- `double_longitude`: Longitude as a floating point number\n",
+ "- `latitude`: Latitude as a floating point number\n",
+ "- `longitude`: Longitude as a floating point number\n",
"\n",
"Optional columns include:\n",
"- `double_speed`: Speed measured at the location\n",
@@ -208,7 +208,120 @@
"id": "86e7396c",
"metadata": {},
"source": [
- "The necessary columns for further analysis are `double_latitude`, `double_longitude`, `double_speed`, and `user`. `user` refers to a unique identifier for a subject."
+ "For further analysis we need the `latitude`, `longitude`, `speed`, and `user` columns. `user` refers to a unique identifier for a subject.\n",
+ "\n",
+ "These columns exist in the data, but some column names are different. We could provide these column names as arguments, but it is easier to rename them here:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "df3a2a0a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " time | \n",
+ " latitude | \n",
+ " longitude | \n",
+ " speed | \n",
+ " user | \n",
+ " datetime | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2013-03-27 06:03:29+02:00 | \n",
+ " 1364357009 | \n",
+ " 43.706667 | \n",
+ " -72.289097 | \n",
+ " 0.00 | \n",
+ " gps_u01 | \n",
+ " 2013-03-27 06:03:29+02:00 | \n",
+ "
\n",
+ " \n",
+ " 2013-03-27 06:23:29+02:00 | \n",
+ " 1364358209 | \n",
+ " 43.706637 | \n",
+ " -72.289066 | \n",
+ " 0.00 | \n",
+ " gps_u01 | \n",
+ " 2013-03-27 06:23:29+02:00 | \n",
+ "
\n",
+ " \n",
+ " 2013-03-27 06:43:25+02:00 | \n",
+ " 1364359405 | \n",
+ " 43.706678 | \n",
+ " -72.289018 | \n",
+ " 0.25 | \n",
+ " gps_u01 | \n",
+ " 2013-03-27 06:43:25+02:00 | \n",
+ "
\n",
+ " \n",
+ " 2013-03-27 07:03:29+02:00 | \n",
+ " 1364360609 | \n",
+ " 43.706665 | \n",
+ " -72.289087 | \n",
+ " 0.00 | \n",
+ " gps_u01 | \n",
+ " 2013-03-27 07:03:29+02:00 | \n",
+ "
\n",
+ " \n",
+ " 2013-03-27 07:23:25+02:00 | \n",
+ " 1364361805 | \n",
+ " 43.706808 | \n",
+ " -72.289370 | \n",
+ " 0.00 | \n",
+ " gps_u01 | \n",
+ " 2013-03-27 07:23:25+02:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " time latitude longitude speed user \\\n",
+ "2013-03-27 06:03:29+02:00 1364357009 43.706667 -72.289097 0.00 gps_u01 \n",
+ "2013-03-27 06:23:29+02:00 1364358209 43.706637 -72.289066 0.00 gps_u01 \n",
+ "2013-03-27 06:43:25+02:00 1364359405 43.706678 -72.289018 0.25 gps_u01 \n",
+ "2013-03-27 07:03:29+02:00 1364360609 43.706665 -72.289087 0.00 gps_u01 \n",
+ "2013-03-27 07:23:25+02:00 1364361805 43.706808 -72.289370 0.00 gps_u01 \n",
+ "\n",
+ " datetime \n",
+ "2013-03-27 06:03:29+02:00 2013-03-27 06:03:29+02:00 \n",
+ "2013-03-27 06:23:29+02:00 2013-03-27 06:23:29+02:00 \n",
+ "2013-03-27 06:43:25+02:00 2013-03-27 06:43:25+02:00 \n",
+ "2013-03-27 07:03:29+02:00 2013-03-27 07:03:29+02:00 \n",
+ "2013-03-27 07:23:25+02:00 2013-03-27 07:23:25+02:00 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = data.rename(columns={\"double_latitude\": \"latitude\", \"double_longitude\": \"longitude\", \"double_speed\": \"speed\"})\n",
+ "data.head()"
]
},
{
@@ -235,7 +348,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"id": "a96bdaa6",
"metadata": {},
"outputs": [
@@ -245,7 +358,7 @@
"(9857, 6)"
]
},
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -290,7 +403,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"id": "01aefd90",
"metadata": {},
"outputs": [
@@ -300,20 +413,20 @@
"(9755, 5)"
]
},
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"binned_data = niimpy.util.aggregate(data, freq='5min', method_numerical='median')\n",
- "binned_data = binned_data.reset_index(0).dropna()\n",
+ "binned_data = binned_data.dropna()\n",
"binned_data.shape"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"id": "d7027bec",
"metadata": {},
"outputs": [
@@ -340,9 +453,9 @@
" | \n",
" user | \n",
" time | \n",
- " double_latitude | \n",
- " double_longitude | \n",
- " double_speed | \n",
+ " latitude | \n",
+ " longitude | \n",
+ " speed | \n",
" \n",
" \n",
" \n",
@@ -386,96 +499,26 @@
" -72.329240 | \n",
" 0.0 | \n",
" \n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 2013-05-29 16:10:00+03:00 | \n",
- " gps_u01 | \n",
- " 1.369833e+09 | \n",
- " 43.706711 | \n",
- " -72.289205 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2013-05-29 16:20:00+03:00 | \n",
- " gps_u01 | \n",
- " 1.369834e+09 | \n",
- " 43.706708 | \n",
- " -72.289162 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2013-05-29 16:30:00+03:00 | \n",
- " gps_u01 | \n",
- " 1.369834e+09 | \n",
- " 43.706725 | \n",
- " -72.289149 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2013-05-29 16:40:00+03:00 | \n",
- " gps_u01 | \n",
- " 1.369835e+09 | \n",
- " 43.706697 | \n",
- " -72.289165 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2013-05-29 16:50:00+03:00 | \n",
- " gps_u01 | \n",
- " 1.369836e+09 | \n",
- " 43.706713 | \n",
- " -72.289191 | \n",
- " 0.0 | \n",
- "
\n",
" \n",
"\n",
- "9755 rows × 5 columns
\n",
""
],
"text/plain": [
- " user time double_latitude \\\n",
- "2013-03-27 06:00:00+02:00 gps_u00 1.364357e+09 43.759135 \n",
- "2013-03-27 06:20:00+02:00 gps_u00 1.364358e+09 43.759503 \n",
- "2013-03-27 06:40:00+02:00 gps_u00 1.364359e+09 43.759134 \n",
- "2013-03-27 07:00:00+02:00 gps_u00 1.364361e+09 43.759135 \n",
- "2013-03-27 07:20:00+02:00 gps_u00 1.364362e+09 43.759135 \n",
- "... ... ... ... \n",
- "2013-05-29 16:10:00+03:00 gps_u01 1.369833e+09 43.706711 \n",
- "2013-05-29 16:20:00+03:00 gps_u01 1.369834e+09 43.706708 \n",
- "2013-05-29 16:30:00+03:00 gps_u01 1.369834e+09 43.706725 \n",
- "2013-05-29 16:40:00+03:00 gps_u01 1.369835e+09 43.706697 \n",
- "2013-05-29 16:50:00+03:00 gps_u01 1.369836e+09 43.706713 \n",
- "\n",
- " double_longitude double_speed \n",
- "2013-03-27 06:00:00+02:00 -72.329240 0.0 \n",
- "2013-03-27 06:20:00+02:00 -72.329018 0.0 \n",
- "2013-03-27 06:40:00+02:00 -72.329238 0.0 \n",
- "2013-03-27 07:00:00+02:00 -72.329240 0.0 \n",
- "2013-03-27 07:20:00+02:00 -72.329240 0.0 \n",
- "... ... ... \n",
- "2013-05-29 16:10:00+03:00 -72.289205 0.0 \n",
- "2013-05-29 16:20:00+03:00 -72.289162 0.0 \n",
- "2013-05-29 16:30:00+03:00 -72.289149 0.0 \n",
- "2013-05-29 16:40:00+03:00 -72.289165 0.0 \n",
- "2013-05-29 16:50:00+03:00 -72.289191 0.0 \n",
- "\n",
- "[9755 rows x 5 columns]"
+ " user time latitude longitude speed\n",
+ "2013-03-27 06:00:00+02:00 gps_u00 1.364357e+09 43.759135 -72.329240 0.0\n",
+ "2013-03-27 06:20:00+02:00 gps_u00 1.364358e+09 43.759503 -72.329018 0.0\n",
+ "2013-03-27 06:40:00+02:00 gps_u00 1.364359e+09 43.759134 -72.329238 0.0\n",
+ "2013-03-27 07:00:00+02:00 gps_u00 1.364361e+09 43.759135 -72.329240 0.0\n",
+ "2013-03-27 07:20:00+02:00 gps_u00 1.364362e+09 43.759135 -72.329240 0.0"
]
},
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "binned_data"
+ "binned_data.head()"
]
},
{
@@ -532,7 +575,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 8,
"id": "5bf0185c",
"metadata": {},
"outputs": [
@@ -559,25 +602,25 @@
" | \n",
" user | \n",
" n_significant_places | \n",
+ " n_home | \n",
" n_sps | \n",
- " n_static | \n",
+ " normalized_entropy | \n",
" n_moving | \n",
- " n_rare | \n",
- " n_home | \n",
+ " n_top4 | \n",
" max_dist_home | \n",
- " n_transitions | \n",
- " n_top1 | \n",
- " ... | \n",
" n_top5 | \n",
+ " n_rare | \n",
+ " ... | \n",
+ " n_top1 | \n",
" entropy | \n",
- " normalized_entropy | \n",
+ " n_transitions | \n",
+ " variance | \n",
+ " log_variance | \n",
" dist_total | \n",
- " n_bins | \n",
" speed_average | \n",
- " speed_variance | \n",
" speed_max | \n",
- " variance | \n",
- " log_variance | \n",
+ " n_bins | \n",
+ " speed_variance | \n",
" \n",
" \n",
" \n",
@@ -585,169 +628,169 @@
" 2013-03-31 00:00:00+02:00 | \n",
" gps_u00 | \n",
" 6 | \n",
+ " 97.0 | \n",
" 5.0 | \n",
- " 280.0 | \n",
- " 8.0 | \n",
- " 3.0 | \n",
- " 106.0 | \n",
+ " 3.006920 | \n",
+ " 54.0 | \n",
+ " 18.0 | \n",
" 2.074186e+04 | \n",
- " 48.0 | \n",
- " 106.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
" ... | \n",
- " 18.0 | \n",
- " 5.091668 | \n",
- " 3.163631 | \n",
- " 4.132581e+05 | \n",
- " 288.0 | \n",
- " 0.033496 | \n",
- " 0.044885 | \n",
- " 1.750000 | \n",
+ " 98.0 | \n",
+ " 4.839451 | \n",
+ " 25.0 | \n",
" 0.003146 | \n",
" -5.761688 | \n",
+ " 4.132581e+05 | \n",
+ " 1.116127 | \n",
+ " 17.284037 | \n",
+ " 288.0 | \n",
+ " 9.876405 | \n",
" \n",
" \n",
" 2013-04-30 00:00:00+03:00 | \n",
" gps_u00 | \n",
" 10 | \n",
- " 10.0 | \n",
- " 1966.0 | \n",
- " 66.0 | \n",
- " 45.0 | \n",
- " 1010.0 | \n",
+ " 969.0 | \n",
+ " 8.0 | \n",
+ " 3.045317 | \n",
+ " 318.0 | \n",
+ " 37.0 | \n",
" 2.914790e+05 | \n",
- " 194.0 | \n",
- " 1016.0 | \n",
+ " 18.0 | \n",
+ " 18.0 | \n",
" ... | \n",
- " 38.0 | \n",
- " 7.284903 | \n",
- " 3.163793 | \n",
- " 2.179693e+06 | \n",
- " 2032.0 | \n",
- " 0.269932 | \n",
- " 6.129277 | \n",
- " 33.250000 | \n",
+ " 976.0 | \n",
+ " 6.332559 | \n",
+ " 97.0 | \n",
" 0.237133 | \n",
" -1.439133 | \n",
+ " 2.179693e+06 | \n",
+ " 0.821680 | \n",
+ " 33.831053 | \n",
+ " 2032.0 | \n",
+ " 9.970465 | \n",
"
\n",
" \n",
" 2013-05-31 00:00:00+03:00 | \n",
" gps_u00 | \n",
" 15 | \n",
- " 12.0 | \n",
- " 1827.0 | \n",
- " 76.0 | \n",
- " 86.0 | \n",
- " 1028.0 | \n",
+ " 1007.0 | \n",
+ " 9.0 | \n",
+ " 3.110317 | \n",
+ " 255.0 | \n",
+ " 43.0 | \n",
" 1.041741e+06 | \n",
- " 107.0 | \n",
- " 1030.0 | \n",
+ " 38.0 | \n",
+ " 31.0 | \n",
" ... | \n",
- " 46.0 | \n",
- " 6.701177 | \n",
- " 2.696752 | \n",
- " 6.986551e+06 | \n",
- " 1903.0 | \n",
- " 0.351280 | \n",
- " 7.590639 | \n",
- " 34.000000 | \n",
+ " 1009.0 | \n",
+ " 6.834065 | \n",
+ " 69.0 | \n",
" 8.288687 | \n",
" 2.114892 | \n",
+ " 6.986551e+06 | \n",
+ " 0.847341 | \n",
+ " 42.507751 | \n",
+ " 1903.0 | \n",
+ " 15.081070 | \n",
"
\n",
" \n",
" 2013-06-30 00:00:00+03:00 | \n",
" gps_u00 | \n",
" 1 | \n",
- " 1.0 | \n",
- " 22.0 | \n",
- " 2.0 | \n",
- " 15.0 | \n",
" 0.0 | \n",
- " 2.035837e+04 | \n",
- " 10.0 | \n",
- " 15.0 | \n",
- " ... | \n",
" 0.0 | \n",
" 0.000000 | \n",
+ " 18.0 | \n",
+ " 0.0 | \n",
+ " 1.989381e+04 | \n",
+ " 0.0 | \n",
+ " 6.0 | \n",
+ " ... | \n",
+ " 6.0 | \n",
" 0.000000 | \n",
- " 2.252893e+05 | \n",
- " 24.0 | \n",
- " 0.044126 | \n",
- " 0.021490 | \n",
- " 0.559017 | \n",
+ " 0.0 | \n",
" 0.014991 | \n",
" -4.200287 | \n",
+ " 2.252893e+05 | \n",
+ " 14.601880 | \n",
+ " 43.321397 | \n",
+ " 24.0 | \n",
+ " 242.791725 | \n",
"
\n",
" \n",
" 2013-03-31 00:00:00+02:00 | \n",
" gps_u01 | \n",
" 4 | \n",
+ " 273.0 | \n",
" 2.0 | \n",
- " 307.0 | \n",
- " 18.0 | \n",
+ " 2.584963 | \n",
+ " 12.0 | \n",
" 0.0 | \n",
- " 260.0 | \n",
" 6.975303e+02 | \n",
- " 8.0 | \n",
- " 286.0 | \n",
- " ... | \n",
" 0.0 | \n",
- " 3.044522 | \n",
- " 4.392317 | \n",
- " 1.328713e+04 | \n",
- " 325.0 | \n",
- " 0.056290 | \n",
- " 0.073370 | \n",
- " 2.692582 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 307.0 | \n",
+ " 1.791759 | \n",
+ " 4.0 | \n",
" 0.000004 | \n",
" -12.520989 | \n",
+ " 1.328713e+04 | \n",
+ " 0.029994 | \n",
+ " 0.744393 | \n",
+ " 325.0 | \n",
+ " 0.008880 | \n",
"
\n",
" \n",
" 2013-04-30 00:00:00+03:00 | \n",
" gps_u01 | \n",
" 4 | \n",
- " 1.0 | \n",
- " 1999.0 | \n",
- " 71.0 | \n",
- " 1.0 | \n",
- " 1500.0 | \n",
- " 1.156568e+04 | \n",
+ " 1492.0 | \n",
" 2.0 | \n",
- " 1998.0 | \n",
- " ... | \n",
+ " 5.977280 | \n",
+ " 78.0 | \n",
" 0.0 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 1.238429e+05 | \n",
- " 2070.0 | \n",
- " 0.066961 | \n",
- " 0.629393 | \n",
- " 32.750000 | \n",
+ " 1.156568e+04 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " ... | \n",
+ " 1928.0 | \n",
+ " 4.143135 | \n",
+ " 26.0 | \n",
" 0.000027 | \n",
" -10.510017 | \n",
+ " 1.238429e+05 | \n",
+ " 0.050416 | \n",
+ " 16.992157 | \n",
+ " 2070.0 | \n",
+ " 0.237219 | \n",
"
\n",
" \n",
" 2013-05-31 00:00:00+03:00 | \n",
" gps_u01 | \n",
" 2 | \n",
+ " 42.0 | \n",
" 1.0 | \n",
- " 3079.0 | \n",
- " 34.0 | \n",
- " 1.0 | \n",
- " 45.0 | \n",
- " 3.957650e+03 | \n",
- " 2.0 | \n",
- " 3078.0 | \n",
- " ... | \n",
- " 0.0 | \n",
" 0.000000 | \n",
+ " 110.0 | \n",
+ " 0.0 | \n",
+ " 6.771047e+02 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 3003.0 | \n",
" 0.000000 | \n",
- " 1.228235e+05 | \n",
- " 3113.0 | \n",
- " 0.026392 | \n",
- " 0.261978 | \n",
- " 20.250000 | \n",
+ " 0.0 | \n",
" 0.000012 | \n",
" -11.364454 | \n",
+ " 1.228235e+05 | \n",
+ " 0.044657 | \n",
+ " 9.967899 | \n",
+ " 3113.0 | \n",
+ " 0.085366 | \n",
"
\n",
" \n",
"\n",
@@ -755,64 +798,64 @@
""
],
"text/plain": [
- " user n_significant_places n_sps n_static \\\n",
- "2013-03-31 00:00:00+02:00 gps_u00 6 5.0 280.0 \n",
- "2013-04-30 00:00:00+03:00 gps_u00 10 10.0 1966.0 \n",
- "2013-05-31 00:00:00+03:00 gps_u00 15 12.0 1827.0 \n",
- "2013-06-30 00:00:00+03:00 gps_u00 1 1.0 22.0 \n",
- "2013-03-31 00:00:00+02:00 gps_u01 4 2.0 307.0 \n",
- "2013-04-30 00:00:00+03:00 gps_u01 4 1.0 1999.0 \n",
- "2013-05-31 00:00:00+03:00 gps_u01 2 1.0 3079.0 \n",
+ " user n_significant_places n_home n_sps \\\n",
+ "2013-03-31 00:00:00+02:00 gps_u00 6 97.0 5.0 \n",
+ "2013-04-30 00:00:00+03:00 gps_u00 10 969.0 8.0 \n",
+ "2013-05-31 00:00:00+03:00 gps_u00 15 1007.0 9.0 \n",
+ "2013-06-30 00:00:00+03:00 gps_u00 1 0.0 0.0 \n",
+ "2013-03-31 00:00:00+02:00 gps_u01 4 273.0 2.0 \n",
+ "2013-04-30 00:00:00+03:00 gps_u01 4 1492.0 2.0 \n",
+ "2013-05-31 00:00:00+03:00 gps_u01 2 42.0 1.0 \n",
"\n",
- " n_moving n_rare n_home max_dist_home \\\n",
- "2013-03-31 00:00:00+02:00 8.0 3.0 106.0 2.074186e+04 \n",
- "2013-04-30 00:00:00+03:00 66.0 45.0 1010.0 2.914790e+05 \n",
- "2013-05-31 00:00:00+03:00 76.0 86.0 1028.0 1.041741e+06 \n",
- "2013-06-30 00:00:00+03:00 2.0 15.0 0.0 2.035837e+04 \n",
- "2013-03-31 00:00:00+02:00 18.0 0.0 260.0 6.975303e+02 \n",
- "2013-04-30 00:00:00+03:00 71.0 1.0 1500.0 1.156568e+04 \n",
- "2013-05-31 00:00:00+03:00 34.0 1.0 45.0 3.957650e+03 \n",
+ " normalized_entropy n_moving n_top4 \\\n",
+ "2013-03-31 00:00:00+02:00 3.006920 54.0 18.0 \n",
+ "2013-04-30 00:00:00+03:00 3.045317 318.0 37.0 \n",
+ "2013-05-31 00:00:00+03:00 3.110317 255.0 43.0 \n",
+ "2013-06-30 00:00:00+03:00 0.000000 18.0 0.0 \n",
+ "2013-03-31 00:00:00+02:00 2.584963 12.0 0.0 \n",
+ "2013-04-30 00:00:00+03:00 5.977280 78.0 0.0 \n",
+ "2013-05-31 00:00:00+03:00 0.000000 110.0 0.0 \n",
"\n",
- " n_transitions n_top1 ... n_top5 entropy \\\n",
- "2013-03-31 00:00:00+02:00 48.0 106.0 ... 18.0 5.091668 \n",
- "2013-04-30 00:00:00+03:00 194.0 1016.0 ... 38.0 7.284903 \n",
- "2013-05-31 00:00:00+03:00 107.0 1030.0 ... 46.0 6.701177 \n",
- "2013-06-30 00:00:00+03:00 10.0 15.0 ... 0.0 0.000000 \n",
- "2013-03-31 00:00:00+02:00 8.0 286.0 ... 0.0 3.044522 \n",
- "2013-04-30 00:00:00+03:00 2.0 1998.0 ... 0.0 0.000000 \n",
- "2013-05-31 00:00:00+03:00 2.0 3078.0 ... 0.0 0.000000 \n",
+ " max_dist_home n_top5 n_rare ... n_top1 \\\n",
+ "2013-03-31 00:00:00+02:00 2.074186e+04 8.0 0.0 ... 98.0 \n",
+ "2013-04-30 00:00:00+03:00 2.914790e+05 18.0 18.0 ... 976.0 \n",
+ "2013-05-31 00:00:00+03:00 1.041741e+06 38.0 31.0 ... 1009.0 \n",
+ "2013-06-30 00:00:00+03:00 1.989381e+04 0.0 6.0 ... 6.0 \n",
+ "2013-03-31 00:00:00+02:00 6.975303e+02 0.0 0.0 ... 307.0 \n",
+ "2013-04-30 00:00:00+03:00 1.156568e+04 0.0 1.0 ... 1928.0 \n",
+ "2013-05-31 00:00:00+03:00 6.771047e+02 0.0 0.0 ... 3003.0 \n",
"\n",
- " normalized_entropy dist_total n_bins \\\n",
- "2013-03-31 00:00:00+02:00 3.163631 4.132581e+05 288.0 \n",
- "2013-04-30 00:00:00+03:00 3.163793 2.179693e+06 2032.0 \n",
- "2013-05-31 00:00:00+03:00 2.696752 6.986551e+06 1903.0 \n",
- "2013-06-30 00:00:00+03:00 0.000000 2.252893e+05 24.0 \n",
- "2013-03-31 00:00:00+02:00 4.392317 1.328713e+04 325.0 \n",
- "2013-04-30 00:00:00+03:00 0.000000 1.238429e+05 2070.0 \n",
- "2013-05-31 00:00:00+03:00 0.000000 1.228235e+05 3113.0 \n",
+ " entropy n_transitions variance log_variance \\\n",
+ "2013-03-31 00:00:00+02:00 4.839451 25.0 0.003146 -5.761688 \n",
+ "2013-04-30 00:00:00+03:00 6.332559 97.0 0.237133 -1.439133 \n",
+ "2013-05-31 00:00:00+03:00 6.834065 69.0 8.288687 2.114892 \n",
+ "2013-06-30 00:00:00+03:00 0.000000 0.0 0.014991 -4.200287 \n",
+ "2013-03-31 00:00:00+02:00 1.791759 4.0 0.000004 -12.520989 \n",
+ "2013-04-30 00:00:00+03:00 4.143135 26.0 0.000027 -10.510017 \n",
+ "2013-05-31 00:00:00+03:00 0.000000 0.0 0.000012 -11.364454 \n",
"\n",
- " speed_average speed_variance speed_max variance \\\n",
- "2013-03-31 00:00:00+02:00 0.033496 0.044885 1.750000 0.003146 \n",
- "2013-04-30 00:00:00+03:00 0.269932 6.129277 33.250000 0.237133 \n",
- "2013-05-31 00:00:00+03:00 0.351280 7.590639 34.000000 8.288687 \n",
- "2013-06-30 00:00:00+03:00 0.044126 0.021490 0.559017 0.014991 \n",
- "2013-03-31 00:00:00+02:00 0.056290 0.073370 2.692582 0.000004 \n",
- "2013-04-30 00:00:00+03:00 0.066961 0.629393 32.750000 0.000027 \n",
- "2013-05-31 00:00:00+03:00 0.026392 0.261978 20.250000 0.000012 \n",
+ " dist_total speed_average speed_max n_bins \\\n",
+ "2013-03-31 00:00:00+02:00 4.132581e+05 1.116127 17.284037 288.0 \n",
+ "2013-04-30 00:00:00+03:00 2.179693e+06 0.821680 33.831053 2032.0 \n",
+ "2013-05-31 00:00:00+03:00 6.986551e+06 0.847341 42.507751 1903.0 \n",
+ "2013-06-30 00:00:00+03:00 2.252893e+05 14.601880 43.321397 24.0 \n",
+ "2013-03-31 00:00:00+02:00 1.328713e+04 0.029994 0.744393 325.0 \n",
+ "2013-04-30 00:00:00+03:00 1.238429e+05 0.050416 16.992157 2070.0 \n",
+ "2013-05-31 00:00:00+03:00 1.228235e+05 0.044657 9.967899 3113.0 \n",
"\n",
- " log_variance \n",
- "2013-03-31 00:00:00+02:00 -5.761688 \n",
- "2013-04-30 00:00:00+03:00 -1.439133 \n",
- "2013-05-31 00:00:00+03:00 2.114892 \n",
- "2013-06-30 00:00:00+03:00 -4.200287 \n",
- "2013-03-31 00:00:00+02:00 -12.520989 \n",
- "2013-04-30 00:00:00+03:00 -10.510017 \n",
- "2013-05-31 00:00:00+03:00 -11.364454 \n",
+ " speed_variance \n",
+ "2013-03-31 00:00:00+02:00 9.876405 \n",
+ "2013-04-30 00:00:00+03:00 9.970465 \n",
+ "2013-05-31 00:00:00+03:00 15.081070 \n",
+ "2013-06-30 00:00:00+03:00 242.791725 \n",
+ "2013-03-31 00:00:00+02:00 0.008880 \n",
+ "2013-04-30 00:00:00+03:00 0.237219 \n",
+ "2013-05-31 00:00:00+03:00 0.085366 \n",
"\n",
"[7 rows x 23 columns]"
]
},
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -828,7 +871,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 9,
"id": "d2b2e06b",
"metadata": {},
"outputs": [
@@ -854,118 +897,118 @@
" \n",
" | \n",
" user | \n",
+ " variance | \n",
+ " log_variance | \n",
" dist_total | \n",
- " n_bins | \n",
" speed_average | \n",
- " speed_variance | \n",
" speed_max | \n",
- " variance | \n",
- " log_variance | \n",
+ " n_bins | \n",
+ " speed_variance | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-03-31 00:00:00+02:00 | \n",
" gps_u00 | \n",
- " 4.132581e+05 | \n",
- " 288.0 | \n",
- " 0.033496 | \n",
- " 0.044885 | \n",
- " 1.750000 | \n",
" 0.003146 | \n",
" -5.761688 | \n",
+ " 4.132581e+05 | \n",
+ " 1.116127 | \n",
+ " 17.284037 | \n",
+ " 288.0 | \n",
+ " 9.876405 | \n",
"
\n",
" \n",
" 2013-04-30 00:00:00+03:00 | \n",
" gps_u00 | \n",
- " 2.179693e+06 | \n",
- " 2032.0 | \n",
- " 0.269932 | \n",
- " 6.129277 | \n",
- " 33.250000 | \n",
" 0.237133 | \n",
" -1.439133 | \n",
+ " 2.179693e+06 | \n",
+ " 0.821680 | \n",
+ " 33.831053 | \n",
+ " 2032.0 | \n",
+ " 9.970465 | \n",
"
\n",
" \n",
" 2013-05-31 00:00:00+03:00 | \n",
" gps_u00 | \n",
- " 6.986551e+06 | \n",
- " 1903.0 | \n",
- " 0.351280 | \n",
- " 7.590639 | \n",
- " 34.000000 | \n",
" 8.288687 | \n",
" 2.114892 | \n",
+ " 6.986551e+06 | \n",
+ " 0.847341 | \n",
+ " 42.507751 | \n",
+ " 1903.0 | \n",
+ " 15.081070 | \n",
"
\n",
" \n",
" 2013-06-30 00:00:00+03:00 | \n",
" gps_u00 | \n",
- " 2.252893e+05 | \n",
- " 24.0 | \n",
- " 0.044126 | \n",
- " 0.021490 | \n",
- " 0.559017 | \n",
" 0.014991 | \n",
" -4.200287 | \n",
+ " 2.252893e+05 | \n",
+ " 14.601880 | \n",
+ " 43.321397 | \n",
+ " 24.0 | \n",
+ " 242.791725 | \n",
"
\n",
" \n",
" 2013-03-31 00:00:00+02:00 | \n",
" gps_u01 | \n",
- " 1.328713e+04 | \n",
- " 325.0 | \n",
- " 0.056290 | \n",
- " 0.073370 | \n",
- " 2.692582 | \n",
" 0.000004 | \n",
" -12.520989 | \n",
+ " 1.328713e+04 | \n",
+ " 0.029994 | \n",
+ " 0.744393 | \n",
+ " 325.0 | \n",
+ " 0.008880 | \n",
"
\n",
" \n",
" 2013-04-30 00:00:00+03:00 | \n",
" gps_u01 | \n",
- " 1.238429e+05 | \n",
- " 2070.0 | \n",
- " 0.066961 | \n",
- " 0.629393 | \n",
- " 32.750000 | \n",
" 0.000027 | \n",
" -10.510017 | \n",
+ " 1.238429e+05 | \n",
+ " 0.050416 | \n",
+ " 16.992157 | \n",
+ " 2070.0 | \n",
+ " 0.237219 | \n",
"
\n",
" \n",
" 2013-05-31 00:00:00+03:00 | \n",
" gps_u01 | \n",
- " 1.228235e+05 | \n",
- " 3113.0 | \n",
- " 0.026392 | \n",
- " 0.261978 | \n",
- " 20.250000 | \n",
" 0.000012 | \n",
" -11.364454 | \n",
+ " 1.228235e+05 | \n",
+ " 0.044657 | \n",
+ " 9.967899 | \n",
+ " 3113.0 | \n",
+ " 0.085366 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " user dist_total n_bins speed_average \\\n",
- "2013-03-31 00:00:00+02:00 gps_u00 4.132581e+05 288.0 0.033496 \n",
- "2013-04-30 00:00:00+03:00 gps_u00 2.179693e+06 2032.0 0.269932 \n",
- "2013-05-31 00:00:00+03:00 gps_u00 6.986551e+06 1903.0 0.351280 \n",
- "2013-06-30 00:00:00+03:00 gps_u00 2.252893e+05 24.0 0.044126 \n",
- "2013-03-31 00:00:00+02:00 gps_u01 1.328713e+04 325.0 0.056290 \n",
- "2013-04-30 00:00:00+03:00 gps_u01 1.238429e+05 2070.0 0.066961 \n",
- "2013-05-31 00:00:00+03:00 gps_u01 1.228235e+05 3113.0 0.026392 \n",
+ " user variance log_variance dist_total \\\n",
+ "2013-03-31 00:00:00+02:00 gps_u00 0.003146 -5.761688 4.132581e+05 \n",
+ "2013-04-30 00:00:00+03:00 gps_u00 0.237133 -1.439133 2.179693e+06 \n",
+ "2013-05-31 00:00:00+03:00 gps_u00 8.288687 2.114892 6.986551e+06 \n",
+ "2013-06-30 00:00:00+03:00 gps_u00 0.014991 -4.200287 2.252893e+05 \n",
+ "2013-03-31 00:00:00+02:00 gps_u01 0.000004 -12.520989 1.328713e+04 \n",
+ "2013-04-30 00:00:00+03:00 gps_u01 0.000027 -10.510017 1.238429e+05 \n",
+ "2013-05-31 00:00:00+03:00 gps_u01 0.000012 -11.364454 1.228235e+05 \n",
"\n",
- " speed_variance speed_max variance log_variance \n",
- "2013-03-31 00:00:00+02:00 0.044885 1.750000 0.003146 -5.761688 \n",
- "2013-04-30 00:00:00+03:00 6.129277 33.250000 0.237133 -1.439133 \n",
- "2013-05-31 00:00:00+03:00 7.590639 34.000000 8.288687 2.114892 \n",
- "2013-06-30 00:00:00+03:00 0.021490 0.559017 0.014991 -4.200287 \n",
- "2013-03-31 00:00:00+02:00 0.073370 2.692582 0.000004 -12.520989 \n",
- "2013-04-30 00:00:00+03:00 0.629393 32.750000 0.000027 -10.510017 \n",
- "2013-05-31 00:00:00+03:00 0.261978 20.250000 0.000012 -11.364454 "
+ " speed_average speed_max n_bins speed_variance \n",
+ "2013-03-31 00:00:00+02:00 1.116127 17.284037 288.0 9.876405 \n",
+ "2013-04-30 00:00:00+03:00 0.821680 33.831053 2032.0 9.970465 \n",
+ "2013-05-31 00:00:00+03:00 0.847341 42.507751 1903.0 15.081070 \n",
+ "2013-06-30 00:00:00+03:00 14.601880 43.321397 24.0 242.791725 \n",
+ "2013-03-31 00:00:00+02:00 0.029994 0.744393 325.0 0.008880 \n",
+ "2013-04-30 00:00:00+03:00 0.050416 16.992157 2070.0 0.237219 \n",
+ "2013-05-31 00:00:00+03:00 0.044657 9.967899 3113.0 0.085366 "
]
},
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -1010,19 +1053,19 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 10,
"id": "e602497e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "{: {'resample_args': {'rule': '1M'}},\n",
- " : {'resample_args': {'rule': '1M'}},\n",
- " : {'resample_args': {'rule': '1M'}}}"
+ "{: {'resample_args': {'rule': '1ME'}},\n",
+ " : {'resample_args': {'rule': '1ME'}},\n",
+ " : {'resample_args': {'rule': '1ME'}}}"
]
},
- "execution_count": 9,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -1033,7 +1076,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 11,
"id": "6a042466",
"metadata": {},
"outputs": [
@@ -1060,9 +1103,9 @@
" | \n",
" user | \n",
" time | \n",
- " double_latitude | \n",
- " double_longitude | \n",
- " double_speed | \n",
+ " latitude | \n",
+ " longitude | \n",
+ " speed | \n",
" \n",
" \n",
" \n",
@@ -1111,22 +1154,15 @@
""
],
"text/plain": [
- " user time double_latitude \\\n",
- "2013-03-27 06:00:00+02:00 gps_u00 1.364357e+09 43.759135 \n",
- "2013-03-27 06:20:00+02:00 gps_u00 1.364358e+09 43.759503 \n",
- "2013-03-27 06:40:00+02:00 gps_u00 1.364359e+09 43.759134 \n",
- "2013-03-27 07:00:00+02:00 gps_u00 1.364361e+09 43.759135 \n",
- "2013-03-27 07:20:00+02:00 gps_u00 1.364362e+09 43.759135 \n",
- "\n",
- " double_longitude double_speed \n",
- "2013-03-27 06:00:00+02:00 -72.329240 0.0 \n",
- "2013-03-27 06:20:00+02:00 -72.329018 0.0 \n",
- "2013-03-27 06:40:00+02:00 -72.329238 0.0 \n",
- "2013-03-27 07:00:00+02:00 -72.329240 0.0 \n",
- "2013-03-27 07:20:00+02:00 -72.329240 0.0 "
+ " user time latitude longitude speed\n",
+ "2013-03-27 06:00:00+02:00 gps_u00 1.364357e+09 43.759135 -72.329240 0.0\n",
+ "2013-03-27 06:20:00+02:00 gps_u00 1.364358e+09 43.759503 -72.329018 0.0\n",
+ "2013-03-27 06:40:00+02:00 gps_u00 1.364359e+09 43.759134 -72.329238 0.0\n",
+ "2013-03-27 07:00:00+02:00 gps_u00 1.364361e+09 43.759135 -72.329240 0.0\n",
+ "2013-03-27 07:20:00+02:00 gps_u00 1.364362e+09 43.759135 -72.329240 0.0"
]
},
- "execution_count": 10,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -1146,7 +1182,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 12,
"id": "81d9d0d1",
"metadata": {},
"outputs": [
@@ -1172,7 +1208,7 @@
" \n",
" | \n",
" user | \n",
- " double_speed | \n",
+ " speed | \n",
"
\n",
" \n",
" \n",
@@ -1191,12 +1227,12 @@
""
],
"text/plain": [
- " user double_speed\n",
- "0 gps_u00 34.00\n",
- "1 gps_u01 32.75"
+ " user speed\n",
+ "0 gps_u00 34.00\n",
+ "1 gps_u01 32.75"
]
},
- "execution_count": 11,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -1205,7 +1241,7 @@
"# customized function\n",
"def max_speed(df, feature_arg):\n",
" grouped = df.groupby('user')\n",
- " df = grouped['double_speed'].max().reset_index('user')\n",
+ " df = grouped['speed'].max().reset_index('user')\n",
" return df\n",
"\n",
"customized_features = nilo.extract_features_location(\n",
diff --git a/niimpy/preprocessing/location.py b/niimpy/preprocessing/location.py
index 2ec7249f..8439374b 100644
--- a/niimpy/preprocessing/location.py
+++ b/niimpy/preprocessing/location.py
@@ -65,8 +65,8 @@ def filter_location(location,
remove_disabled=True,
remove_zeros=True,
remove_network=False,
- latitude_column = "double_latitude",
- longitude_column = "double_longitude",
+ latitude_column = "latitude",
+ longitude_column = "longitude",
label_column = "label",
provider_column = "provider",
):
@@ -268,8 +268,8 @@ def location_number_of_significant_places(df, config=None):
config = {}
assert isinstance(config, dict), "config is not a dictionary"
- latitude_column = config.get("latitude_column", "double_latitude")
- longitude_column = config.get("longitude_column", "double_longitude")
+ latitude_column = config.get("latitude_column", "latitude")
+ longitude_column = config.get("longitude_column", "longitude")
config["resample_args"] = config.get("resample_args", {"rule": default_freq})
def compute_features(df):
@@ -335,8 +335,8 @@ def location_significant_place_features(df, config=None):
config: A dictionary of optional arguments
Optional arguments in config:
- longitude_column: The name of the column with longitude data in a floating point format. Defaults to 'double_longitude'.
- latitude_column: The name of the column with latitude data in a floating point format. Defaults to 'double_latitude'.
+ longitude_column: The name of the column with longitude data in a floating point format. Defaults to 'longitude'.
+ latitude_column: The name of the column with latitude data in a floating point format. Defaults to 'latitude'.
speed_column: The name of the column with speed data in a floating point format. Defaults to 'double_speed'.
resample_args: a dictionary of arguments for the Pandas resample function. For example to resample by hour, you would pass {"rule": "1h"}.
"""
@@ -345,8 +345,8 @@ def location_significant_place_features(df, config=None):
config = {}
assert isinstance(config, dict), "config is not a dictionary"
- latitude_column = config.get("latitude_column", "double_latitude")
- longitude_column = config.get("longitude_column", "double_latitude")
+ latitude_column = config.get("latitude_column", "latitude")
+ longitude_column = config.get("longitude_column", "latitude")
speed_column = config.get("speed_column", "double_speed")
speed_threshold = config.get("speed_threshold", 0.277)
config["resample_args"] = config.get("resample_args", {"rule": default_freq})
@@ -436,8 +436,8 @@ def location_distance_features(df, config=None):
config: A dictionary of optional arguments
Optional arguments in config:
- longitude_column: The name of the column with longitude data in a floating point format. Defaults to 'double_longitude'.
- latitude_column: The name of the column with latitude data in a floating point format. Defaults to 'double_latitude'.
+ longitude_column: The name of the column with longitude data in a floating point format. Defaults to 'longitude'.
+ latitude_column: The name of the column with latitude data in a floating point format. Defaults to 'latitude'.
speed_column: The name of the column with speed data in a floating point format. Defaults to 'double_speed'.
resample_args: a dictionary of arguments for the Pandas resample function. For example to resample by hour, you would pass {"rule": "1h"}.
"""
@@ -446,8 +446,8 @@ def location_distance_features(df, config=None):
config = {}
assert isinstance(config, dict), "config is not a dictionary"
- latitude_column = config.get("latitude_column", "double_latitude")
- longitude_column = config.get("longitude_column", "double_latitude")
+ latitude_column = config.get("latitude_column", "latitude")
+ longitude_column = config.get("longitude_column", "latitude")
speed_column = config.get("speed_column", "double_speed")
config["resample_args"] = config.get("resample_args", {"rule": default_freq})
@@ -505,7 +505,7 @@ def extract_features_location(df, features=None):
----------
df : pd.DataFrame
dataframe of location data. It must contain these columns:
- `double_latitude`, `double_longitude`, `user`, `group`.
+ `latitude`, `longitude`, `user`, `group`.
`double_speed` is optional. If not provided, it will be
computed manually.
speed_threshold : float