From 75b556829f0660a301c9f58cfd9f831aa79a329a Mon Sep 17 00:00:00 2001
From: Rantaharju Jarno
Date: Mon, 9 Dec 2024 11:35:07 +0200
Subject: [PATCH] Set timezone in google takeout. Add util to readers

---
 niimpy/preprocessing/util.py     | 12 ------
 niimpy/reading/google_takeout.py | 66 +++++++++++++++++++++++++-------
 niimpy/reading/util.py           | 19 +++++++
 3 files changed, 72 insertions(+), 25 deletions(-)
 create mode 100644 niimpy/reading/util.py

diff --git a/niimpy/preprocessing/util.py b/niimpy/preprocessing/util.py
index dbfc3983..1aa6a141 100644
--- a/niimpy/preprocessing/util.py
+++ b/niimpy/preprocessing/util.py
@@ -163,18 +163,6 @@ def to_datetime(value):
         return times.dt.tz_convert(TZ)
     else:
         return times.tz_convert(TZ)
-
-
-def format_column_names(df):
-    # Replace special characters, including space and ., with _
-    # (keeping parenthesis and /, which are used in units, e.g. "temperature (C)")
-    # Convert to lower case
-    column_map = {}
-    for column in df.columns:
-        formatted_name = column.replace(" ", "_").lower()
-        formatted_name = re.sub(r'[^a-zA-Z0-9_()/]+', '_', formatted_name)
-        column_map[column] = formatted_name
-    df.rename(columns=column_map, inplace=True)
 
 
 def identifier_columns(df, id_columns = ["user", "device", "group"]):
diff --git a/niimpy/reading/google_takeout.py b/niimpy/reading/google_takeout.py
index fdf5fcc3..dde23989 100644
--- a/niimpy/reading/google_takeout.py
+++ b/niimpy/reading/google_takeout.py
@@ -2,7 +2,6 @@
 from zipfile import ZipFile
 import json
 import os
-import datetime
 import email
 import uuid
 import warnings
@@ -10,7 +9,7 @@
 from tqdm import tqdm
 from bs4 import BeautifulSoup
 
-from niimpy.preprocessing import util
+from niimpy.reading import util
 import google_takeout_email as email_utils
 
 try:
@@ -126,6 +125,7 @@ def location_history(
     user = None,
     start_date = None,
     end_date = None,
+    timezone = "Europe/Helsinki"
 ):
     """ Read the location history from a google takeout zip file.
 
@@ -210,6 +210,7 @@ def location_history(
     data.drop(drop_columns, axis=1, inplace=True, errors='ignore')
     data.rename(columns=column_name_map, inplace=True)
     util.format_column_names(data)
+    util.set_timezone(data, tz=timezone)
 
     if user is None:
         user = uuid.uuid1()
@@ -255,10 +256,11 @@ def activity(zip_filename, user=None, timezone = 'Europe/Helsinki'):
 
     # Read the more fine grained data for each date
     data = pd.read_csv(zip_file.open(filename))
-    data["timestamp"] = pd.to_datetime(data["Date"]).dt.tz_localize(timezone)
+    data["timestamp"] = pd.to_datetime(data["Date"])
     data.set_index('timestamp', inplace=True)
 
     util.format_column_names(data)
+    util.set_timezone(data, tz=timezone)
 
     if user is None:
         user = uuid.uuid1()
@@ -358,7 +360,14 @@ def close(self):
         self.zip_file.close()
 
 
-def sentiment_analysis_from_email(df, filename, sentiment_batch_size=100, start_date=None, end_date=None):
+def sentiment_analysis_from_email(
+    df,
+    filename,
+    sentiment_batch_size=100,
+    start_date=None,
+    end_date=None,
+    timezone = "Europe/Helsinki"
+    ):
     """ Run sentiment analysis on the content of the email messages in the dataframe.
     """
     content_batch = []
@@ -395,6 +404,7 @@ def sentiment_analysis_from_email(df, filename, sentiment_batch_size=100, start_
     scores = [s["score"] for s in sentiments]
     df["sentiment"] = labels
     df["sentiment_score"] = scores
+    util.set_timezone(df, tz=timezone)
 
     return df
 
@@ -407,6 +417,7 @@ def email_activity(
     sentiment_batch_size = 100,
     start_date = None,
     end_date = None,
+    timezone = "Europe/Helsinki"
 ):
     """ Extract message header data from the GMail inbox in a Google Takeout zip file.
 
@@ -544,6 +555,8 @@ def email_activity(
     df["user"] = user
 
     df.set_index("timestamp", inplace=True)
+    util.format_column_names(df)
+    util.set_timezone(df, tz=timezone)
 
     # Run sentiment analysis if requested. This might take some time.
     if sentiment:
@@ -554,7 +567,11 @@ def email_activity(
 
 
 
-def sentiment_analysis_from_text_column(df, text_content_column, sentiment_batch_size=100):
+def sentiment_analysis_from_text_column(df,
+    text_content_column,
+    sentiment_batch_size=100,
+    timezone = "Europe/Helsinki"
+    ):
     """ Run sentiment analysis on a dataframe with text content and
     add the results as new columns. """
     content_batch = []
@@ -573,6 +590,8 @@ def sentiment_analysis_from_text_column(df, text_content_column, sentiment_batch
     scores = [s["score"] for s in sentiments]
     df["sentiment"] = labels
     df["sentiment_score"] = scores
+    util.format_column_names(df)
+    util.set_timezone(df, tz=timezone)
 
     return df
 
@@ -583,7 +602,8 @@ def chat(
     sentiment=False,
     sentiment_batch_size = 100,
     pseudonymize=True,
     start_date = None,
-    end_date = None
+    end_date = None,
+    timezone = "Europe/Helsinki"
 ):
     """ Read Google chat messages from a Google Takeout zip file.
@@ -689,11 +709,19 @@ def chat(
         df.drop("text", axis=1, inplace=True)
 
     util.format_column_names(df)
+    util.set_timezone(df, tz=timezone)
 
     return df
 
 
-def youtube_watch_history(zip_filename, user=None, pseudonymize=True, start_date = None, end_date = None):
+def youtube_watch_history(
+    zip_filename,
+    user=None,
+    pseudonymize=True,
+    start_date = None,
+    end_date = None,
+    timezone = "Europe/Helsinki"
+    ):
     """ Read the watch history from a Google Takeout zip file.
 
     Watch history is stored as an html file. We parse the file
@@ -773,6 +801,8 @@ def youtube_watch_history(zip_filename, user=None, pseudonymize=True, start_date
         df["video_title"] = df["video_title"].astype("category").cat.codes
         df["channel_title"] = df["channel_title"].astype("category").cat.codes
 
+    util.format_column_names(df)
+    util.set_timezone(df, tz=timezone)
     return df
 
 
@@ -849,7 +879,11 @@ def fit_expand_data_filename(zip_filename, filename):
     return filenames
 
 
-def fit_read_data_file(zip_filename, data_filename):
+def fit_read_data_file(
+    zip_filename,
+    data_filename,
+    timezone = "Europe/Helsinki"
+    ):
     """ Read a data file in the Google Fit All Data folder.
     """
     try:
@@ -935,10 +969,15 @@ def process_fitValue(value, parent_index=None):
     df.drop("originDataSourceId", axis=1, inplace=True)
 
     util.format_column_names(df)
+    util.set_timezone(df, tz=timezone)
 
     return df
 
 
-def fit_read_data(zip_filename, data_filename):
+def fit_read_data(
+    zip_filename,
+    data_filename,
+    timezone = "Europe/Helsinki"
+    ):
     """ Read multiple data files in the Google Fit All Data folder.
     """
@@ -964,11 +1003,11 @@ def fit_read_data(zip_filename, data_filename):
 
     df = pd.concat(dfs)
     df.sort_index(inplace=True)
-
+    util.set_timezone(df, tz=timezone)
     return df
 
 
-def fit_all_data(zip_filename):
+def fit_all_data(zip_filename, timezone = "Europe/Helsinki"):
     """ Read all the data in the Google Fit All Data folder.
     """
     datafiles = fit_list_data(zip_filename)["filename"]
@@ -976,7 +1015,7 @@ def fit_all_data(zip_filename):
     return data
 
 
-def fit_heart_rate_data(zip_filename):
+def fit_heart_rate_data(zip_filename, timezone = "Europe/Helsinki"):
     """ Read heart rate data from Google Fit All Data folder and
     format it more nicely.
 
@@ -1000,7 +1039,7 @@ def fit_heart_rate_data(zip_filename):
     return df
 
 
-def fit_sessions(zip_filename):
+def fit_sessions(zip_filename, timezone = "Europe/Helsinki"):
     """ Read all Google Takeout sessions and concatenate them into a dataframe.
 
     Each file contains aggregate data for a single activity session or sleep session.
@@ -1043,6 +1082,7 @@ def fit_sessions(zip_filename):
     df["duration"] = pd.to_timedelta(df["duration"])
 
     util.format_column_names(df)
+    util.set_timezone(df, tz=timezone)
 
     return df
 
diff --git a/niimpy/reading/util.py b/niimpy/reading/util.py
new file mode 100644
index 00000000..9bd3748e
--- /dev/null
+++ b/niimpy/reading/util.py
@@ -0,0 +1,19 @@
+import re
+
+def format_column_names(df):
+    # Replace special characters, including space and ., with _
+    # (keeping parenthesis and /, which are used in units, e.g. "temperature (C)")
+    # Convert to lower case
+    column_map = {}
+    for column in df.columns:
+        formatted_name = column.replace(" ", "_").lower()
+        formatted_name = re.sub(r'[^a-zA-Z0-9_()/]+', '_', formatted_name)
+        column_map[column] = formatted_name
+    df.rename(columns=column_map, inplace=True)
+
+def set_timezone(df, tz = 'Europe/Helsinki'):
+    """ Set the timezone of the datetime object in the index column """
+    if df.index.tzinfo is None:
+        df.index = df.index.tz_localize(tz)
+    return df
+