Skip to content

Commit

Permalink
Set timezone in google takeout. Add util to readers
Browse files Browse the repository at this point in the history
  • Loading branch information
rantahar committed Dec 9, 2024
1 parent 65a33b9 commit 75b5568
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 25 deletions.
12 changes: 0 additions & 12 deletions niimpy/preprocessing/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,18 +163,6 @@ def to_datetime(value):
return times.dt.tz_convert(TZ)
else:
return times.tz_convert(TZ)


def format_column_names(df):
# Replace special characters, including space and ., with _
# (keeping parenthesis and /, which are used in units, e.g. "temperature (C)")
# Convert to lower case
column_map = {}
for column in df.columns:
formatted_name = column.replace(" ", "_").lower()
formatted_name = re.sub(r'[^a-zA-Z0-9_()/]+', '_', formatted_name)
column_map[column] = formatted_name
df.rename(columns=column_map, inplace=True)


def identifier_columns(df, id_columns = ["user", "device", "group"]):
Expand Down
66 changes: 53 additions & 13 deletions niimpy/reading/google_takeout.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,14 @@
from zipfile import ZipFile
import json
import os
import datetime
import email
import uuid
import warnings
import re

from tqdm import tqdm
from bs4 import BeautifulSoup
from niimpy.preprocessing import util
from niimpy.reading import util
import google_takeout_email as email_utils

try:
Expand Down Expand Up @@ -126,6 +125,7 @@ def location_history(
user = None,
start_date = None,
end_date = None,
timezone = "Europe/Helsinki"
):
""" Read the location history from a google takeout zip file.
Expand Down Expand Up @@ -210,6 +210,7 @@ def location_history(
data.drop(drop_columns, axis=1, inplace=True, errors='ignore')
data.rename(columns=column_name_map, inplace=True)
util.format_column_names(data)
util.set_timezone(data, tz=timezone)

if user is None:
user = uuid.uuid1()
Expand Down Expand Up @@ -255,10 +256,11 @@ def activity(zip_filename, user=None, timezone = 'Europe/Helsinki'):
# Read the more fine grained data for each date
data = pd.read_csv(zip_file.open(filename))

data["timestamp"] = pd.to_datetime(data["Date"]).dt.tz_localize(timezone)
data["timestamp"] = pd.to_datetime(data["Date"])

data.set_index('timestamp', inplace=True)
util.format_column_names(data)
util.set_timezone(data, tz=timezone)

if user is None:
user = uuid.uuid1()
Expand Down Expand Up @@ -358,7 +360,14 @@ def close(self):
self.zip_file.close()


def sentiment_analysis_from_email(df, filename, sentiment_batch_size=100, start_date=None, end_date=None):
def sentiment_analysis_from_email(
df,
filename,
sentiment_batch_size=100,
start_date=None,
end_date=None,
timezone = "Europe/Helsinki"
):
""" Run sentiment analysis on the content of the email messages
in the dataframe. """
content_batch = []
Expand Down Expand Up @@ -395,6 +404,7 @@ def sentiment_analysis_from_email(df, filename, sentiment_batch_size=100, start_
scores = [s["score"] for s in sentiments]
df["sentiment"] = labels
df["sentiment_score"] = scores
util.set_timezone(df, tz=timezone)

return df

Expand All @@ -407,6 +417,7 @@ def email_activity(
sentiment_batch_size = 100,
start_date = None,
end_date = None,
timezone = "Europe/Helsinki"
):
""" Extract message header data from the GMail inbox in
a Google Takeout zip file.
Expand Down Expand Up @@ -544,6 +555,8 @@ def email_activity(
df["user"] = user

df.set_index("timestamp", inplace=True)
util.format_column_names(df)
util.set_timezone(df, tz=timezone)

# Run sentiment analysis if requested. This might take some time.
if sentiment:
Expand All @@ -554,7 +567,11 @@ def email_activity(



def sentiment_analysis_from_text_column(df, text_content_column, sentiment_batch_size=100):
def sentiment_analysis_from_text_column(df,
text_content_column,
sentiment_batch_size=100,
timezone = "Europe/Helsinki"
):
""" Run sentiment analysis on a dataframe with text content
and add the results as new columns. """
content_batch = []
Expand All @@ -573,6 +590,8 @@ def sentiment_analysis_from_text_column(df, text_content_column, sentiment_batch
scores = [s["score"] for s in sentiments]
df["sentiment"] = labels
df["sentiment_score"] = scores
util.format_column_names(df)
util.set_timezone(df, tz=timezone)
return df


Expand All @@ -583,7 +602,8 @@ def chat(
sentiment=False, sentiment_batch_size = 100,
pseudonymize=True,
start_date = None,
end_date = None
end_date = None,
timezone = "Europe/Helsinki"
):
""" Read Google chat messages from a Google Takeout zip file.
Expand Down Expand Up @@ -689,11 +709,19 @@ def chat(
df.drop("text", axis=1, inplace=True)

util.format_column_names(df)
util.set_timezone(df, tz=timezone)
return df



def youtube_watch_history(zip_filename, user=None, pseudonymize=True, start_date = None, end_date = None):
def youtube_watch_history(
zip_filename,
user=None,
pseudonymize=True,
start_date = None,
end_date = None,
timezone = "Europe/Helsinki"
):
""" Read the watch history from a Google Takeout zip file.
Watch history is stored as an html file. We parse the file
Expand Down Expand Up @@ -773,6 +801,8 @@ def youtube_watch_history(zip_filename, user=None, pseudonymize=True, start_date
df["video_title"] = df["video_title"].astype("category").cat.codes
df["channel_title"] = df["channel_title"].astype("category").cat.codes

util.format_column_names(df)
util.set_timezone(df, tz=timezone)
return df


Expand Down Expand Up @@ -849,7 +879,11 @@ def fit_expand_data_filename(zip_filename, filename):
return filenames


def fit_read_data_file(zip_filename, data_filename):
def fit_read_data_file(
zip_filename,
data_filename,
timezone = "Europe/Helsinki"
):
""" Read a data file in the Google Fit All Data folder.
"""
try:
Expand Down Expand Up @@ -935,10 +969,15 @@ def process_fitValue(value, parent_index=None):
df.drop("originDataSourceId", axis=1, inplace=True)

util.format_column_names(df)
util.set_timezone(df, tz=timezone)
return df


def fit_read_data(zip_filename, data_filename):
def fit_read_data(
zip_filename,
data_filename,
timezone = "Europe/Helsinki"
):
""" Read multiple data files in the Google Fit All Data folder.
"""

Expand All @@ -964,19 +1003,19 @@ def fit_read_data(zip_filename, data_filename):

df = pd.concat(dfs)
df.sort_index(inplace=True)

util.set_timezone(df, tz=timezone)
return df


def fit_all_data(zip_filename):
def fit_all_data(zip_filename, timezone = "Europe/Helsinki"):
""" Read all the data in the Google Fit All Data folder.
"""
datafiles = fit_list_data(zip_filename)["filename"]
data = fit_read_data(zip_filename, datafiles)
return data


def fit_heart_rate_data(zip_filename):
def fit_heart_rate_data(zip_filename, timezone = "Europe/Helsinki"):
""" Read heart rate data from Google Fit All Data folder and
format it more nicely.
Expand All @@ -1000,7 +1039,7 @@ def fit_heart_rate_data(zip_filename):
return df


def fit_sessions(zip_filename):
def fit_sessions(zip_filename, timezone = "Europe/Helsinki"):
""" Read all Google Takeout sessions and concatenate them into
a dataframe. Each file contains aggregate data for a single
activity session or sleep session.
Expand Down Expand Up @@ -1043,6 +1082,7 @@ def fit_sessions(zip_filename):
df["duration"] = pd.to_timedelta(df["duration"])

util.format_column_names(df)
util.set_timezone(df, tz=timezone)

return df

Expand Down
19 changes: 19 additions & 0 deletions niimpy/reading/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import re

def format_column_names(df):
# Replace special characters, including space and ., with _
# (keeping parenthesis and /, which are used in units, e.g. "temperature (C)")
# Convert to lower case
column_map = {}
for column in df.columns:
formatted_name = column.replace(" ", "_").lower()
formatted_name = re.sub(r'[^a-zA-Z0-9_()/]+', '_', formatted_name)
column_map[column] = formatted_name
df.rename(columns=column_map, inplace=True)

def set_timezone(df, tz = 'Europe/Helsinki'):
""" Set the timezone of the datetime object in the index column """
if df.index.tzinfo is None:
df.index = df.index.tz_localize(tz)
return df

0 comments on commit 75b5568

Please sign in to comment.