Skip to content

Commit

Permalink
Merge pull request #2 from UtrechtUniversity/web-tweaks
Browse files Browse the repository at this point in the history
Web tweaks
  • Loading branch information
vloothuis authored Jun 18, 2021
2 parents 14ab0cc + ef3ece3 commit 3844e44
Show file tree
Hide file tree
Showing 16 changed files with 735 additions and 29 deletions.
44 changes: 44 additions & 0 deletions .github/workflows/on_pull_request.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# CI for pull requests: install the package with Poetry, then lint
# (pylint + flake8) and run the test suite with coverage.
name: Build-Lint-Test
on: [pull_request]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Check out the code
        uses: actions/checkout@v1
        with:
          # Shallow clone is enough: only the checked-out tree is needed.
          fetch-depth: 1

      - name: Set up Python 3.8
        uses: actions/setup-python@v1
        with:
          python-version: 3.8

      - name: Install Poetry
        uses: snok/[email protected]

      - name: Cache Poetry installed packages
        uses: actions/cache@v1
        id: cache
        with:
          path: .venv
          # Cache key follows the lockfile, so dependency changes refresh it.
          key: poetry-${{ hashFiles('**/poetry.lock') }}
          restore-keys: poetry-

      - name: Install Dependencies using Poetry
        working-directory: data_extractor
        run: poetry install
        # Skip the install when the cached virtualenv was restored.
        if: steps.cache.outputs.cache-hit != 'true'

      - name: Pylint
        working-directory: data_extractor
        run: poetry run pylint google_semantic_location_history

      - name: Flake8
        working-directory: data_extractor
        run: poetry run flake8 google_semantic_location_history

      - name: Pytest
        working-directory: data_extractor
        run: poetry run pytest -v --cov=google_semantic_location_history --cov=data_extractor --cov-fail-under=80 tests/
1 change: 1 addition & 0 deletions .tool-versions
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python 3.8.7
3 changes: 3 additions & 0 deletions data_extractor/.flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[flake8]
max-line-length = 100
exclude = tests/*
5 changes: 5 additions & 0 deletions data_extractor/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bin/
lib/
lib64
pyvenv.cfg
dist
4 changes: 4 additions & 0 deletions data_extractor/.pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[FORMAT]

# Maximum number of characters on a single line.
max-line-length=100
5 changes: 2 additions & 3 deletions data_extractor/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ The behavior of the ``process`` function can be verified by running the tests.
The tests are located in the ``tests`` folder. To run the tests, execute:
``poetry run pytest``.

To run the extraction code from the browser first run ``poetry build`` to
create an updated version of the Python Wheel (package).
Now run: ``python3 -m http.server`` from the root folder (the one with
To run the extraction code from the browser run:
``python3 -m http.server`` from the root folder (the one with
``.git``). This will start a webserver on:
`localhost <http://localhost:8000>`__.

Expand Down
1 change: 1 addition & 0 deletions data_extractor/data_extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import zipfile


def process(file_data):
names = []
data = []
Expand Down
Binary file not shown.
Binary file removed data_extractor/dist/data_extractor-0.1.0.tar.gz
Binary file not shown.
134 changes: 134 additions & 0 deletions data_extractor/google_semantic_location_history/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""Script to extract data from Google Semantic History Location zipfile"""
__version__ = '0.1.0'

import json
import itertools
import re
import zipfile

import pandas as pd


# Years and months to extract data for.
YEARS = [2019, 2020, 2021]
MONTHS = ["JANUARY"]
# Number of most-visited places reported per month.
NPLACES = 3
# Summary text returned to the participant alongside the extracted data.
TEXT = "This study examines the change in travel behaviour during the COVID-19 pandemic. \
We therefore examined your Google Semantic Location History data for January in 2019, \
2020, and 2021. To be precise, we extracted per month the total number of visited places, \
and the number of days spent per place for the three most visited places. Also, we extracted \
the number of days spent in places and travelling, and the travelled distance in km."


def __visit_duration(data):
    """Compute the time spent at each visited place.

    Args:
        data (dict): Google Semantic Location History data for one month,
            a dict with a "timelineObjects" list of placeVisit /
            activitySegment entries.
    Returns:
        dict: placeId -> total visit duration in days (rounded to 3
        decimals), sorted by duration in descending order.
    """
    # Accumulate the total duration per placeId in a single pass instead of
    # collecting one-entry dicts and re-scanning them per distinct place.
    durations = {}
    for data_unit in data["timelineObjects"]:
        visit = data_unit.get("placeVisit")
        if visit is None:
            continue
        place_id = visit["location"]["placeId"]
        start_ms = int(visit["duration"]["startTimestampMs"])
        end_ms = int(visit["duration"]["endTimestampMs"])
        # Timestamps are in milliseconds; convert the difference to days.
        durations[place_id] = (
            durations.get(place_id, 0.0) + (end_ms - start_ms) / (1e3 * 24 * 60 * 60))

    places = {place: round(days, 3) for place, days in durations.items()}
    # Sort places by amount of time spent, longest first.
    return dict(sorted(places.items(), key=lambda kv: kv[1], reverse=True))


def __activity_duration(data):
    """Compute the total time spent in activity segments.

    Args:
        data (dict): Google Semantic Location History data
    Returns:
        float: total duration of activities in days
    """
    # Millisecond timestamps -> days.
    msec_per_day = 1e3 * 24 * 60 * 60
    return sum(
        ((int(unit["activitySegment"]["duration"]["endTimestampMs"])
          - int(unit["activitySegment"]["duration"]["startTimestampMs"])) / msec_per_day
         for unit in data["timelineObjects"] if "activitySegment" in unit),
        0.0)


def __activity_distance(data):
    """Compute the total distance travelled during activity segments.

    Args:
        data (dict): Google Semantic Location History data
    Returns:
        float: total distance of activities in km
    """
    # "distance" is stored in metres; convert to kilometres.
    return sum(
        (int(unit["activitySegment"]["distance"]) / 1000.0
         for unit in data["timelineObjects"] if "activitySegment" in unit),
        0.0)


def process(file_data):
    """Extract a per-month travel summary from a Google Takeout zipfile.

    Args:
        file_data: path to a zip file, or a file-like object containing one
    Returns:
        dict: {"summary": explanatory text, "data_frames": [DataFrame]}
            with one DataFrame row per extracted (year, month)
    """
    results = []

    # The Takeout archive contains one JSON file per month, named like
    # "2019_JANUARY.json"; extract the selected years and months.
    with zipfile.ZipFile(file_data) as zfile:
        file_list = zfile.namelist()
        for year in YEARS:
            for month in MONTHS:
                monthfile = f"{year}_{month}.json"
                for name in file_list:
                    # Plain substring test: the previous regex search left
                    # "." unescaped, which matched any character.
                    if monthfile not in name:
                        continue
                    data = json.loads(zfile.read(name).decode("utf8"))
                    places = __visit_duration(data)
                    results.append({
                        "Year": year,
                        "Month": month,
                        "Top Places": dict(itertools.islice(places.items(), NPLACES)),
                        "Number of Places": len(places),
                        "Places Duration [days]": round(sum(places.values()), 3),
                        "Activity Duration [days]": round(__activity_duration(data), 3),
                        "Activity Distance [km]": round(__activity_distance(data), 3)
                    })
                    # Use only the first matching file for this month.
                    break

    # Put results in a DataFrame; "Top Places" expands to one column per place.
    data_frame = pd.json_normalize(results)

    # Anonymize by replacing placeId columns with numbered "Place N" columns.
    number = 0
    for column in data_frame.columns:
        if column.split(".")[0] == "Top Places":
            number += 1
            data_frame.rename(columns={column: f"Place {number} [days]"}, inplace=True)

    return {
        "summary": TEXT,
        "data_frames": [
            data_frame.fillna(0)
        ]
    }
14 changes: 14 additions & 0 deletions data_extractor/google_semantic_location_history/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"Main program to test google_semantic_location history script"
import io
import pandas as pd
from google_semantic_location_history import process


if __name__ == '__main__':
result = process("tests/data/Location History.zip")
print("\nRaw result:")
print(result)
data_frame = pd.read_csv(io.StringIO(result["data"]), sep=",")
pd.options.display.max_columns = 9
print("\nDataframe:")
print(data_frame)
Loading

0 comments on commit 3844e44

Please sign in to comment.