Skip to content

Commit

Permalink
Merge pull request #2 from UtrechtUniversity/web-tweaks
Browse files Browse the repository at this point in the history
Web tweaks
  • Loading branch information
vloothuis authored Jun 18, 2021
2 parents 14ab0cc + ef3ece3 commit 3844e44
Show file tree
Hide file tree
Showing 16 changed files with 735 additions and 29 deletions.
44 changes: 44 additions & 0 deletions .github/workflows/on_pull_request.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# CI for pull requests: install the package with Poetry, then lint
# (pylint + flake8) and run the test suite with coverage.
name: Build-Lint-Test
on: [pull_request]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Check out the code
        uses: actions/checkout@v1
        with:
          # Shallow clone is enough: only the checked-out tree is needed.
          fetch-depth: 1

      - name: Set up Python 3.8
        uses: actions/setup-python@v1
        with:
          python-version: 3.8

      - name: Install Poetry
        uses: snok/[email protected]

      - name: Cache Poetry installed packages
        uses: actions/cache@v1
        id: cache
        with:
          path: .venv
          # Cache key follows the lockfile, so dependency changes refresh it.
          key: poetry-${{ hashFiles('**/poetry.lock') }}
          restore-keys: poetry-

      - name: Install Dependencies using Poetry
        working-directory: data_extractor
        run: poetry install
        # Skip the install when the cached virtualenv was restored.
        if: steps.cache.outputs.cache-hit != 'true'

      - name: Pylint
        working-directory: data_extractor
        run: poetry run pylint google_semantic_location_history

      - name: Flake8
        working-directory: data_extractor
        run: poetry run flake8 google_semantic_location_history

      - name: Pytest
        working-directory: data_extractor
        run: poetry run pytest -v --cov=google_semantic_location_history --cov=data_extractor --cov-fail-under=80 tests/
1 change: 1 addition & 0 deletions .tool-versions
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python 3.8.7
3 changes: 3 additions & 0 deletions data_extractor/.flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[flake8]
max-line-length = 100
exclude = tests/*
5 changes: 5 additions & 0 deletions data_extractor/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bin/
lib/
lib64
pyvenv.cfg
dist
4 changes: 4 additions & 0 deletions data_extractor/.pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[FORMAT]

# Maximum number of characters on a single line.
max-line-length=100
5 changes: 2 additions & 3 deletions data_extractor/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ The behavior of the ``process`` function can be verified by running the tests.
The tests are located in the ``tests`` folder. To run the tests, execute:
``poetry run pytest``.

To run the extraction code from the browser first run ``poetry build`` to
create an updated version of the Python Wheel (package).
Now run: ``python3 -m http.server`` from the root folder (the one with
To run the extraction code from the browser run:
``python3 -m http.server`` from the root folder (the one with
``.git``). This will start a webserver on:
`localhost <http://localhost:8000>`__.

Expand Down
1 change: 1 addition & 0 deletions data_extractor/data_extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import zipfile


def process(file_data):
names = []
data = []
Expand Down
Binary file not shown.
Binary file removed data_extractor/dist/data_extractor-0.1.0.tar.gz
Binary file not shown.
134 changes: 134 additions & 0 deletions data_extractor/google_semantic_location_history/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""Script to extract data from Google Semantic History Location zipfile"""
__version__ = '0.1.0'

import json
import itertools
import re
import zipfile

import pandas as pd


# Years and months to extract data for.
YEARS = [2019, 2020, 2021]
MONTHS = ["JANUARY"]
# Number of most-visited places reported per month.
NPLACES = 3
# Summary text returned to the participant alongside the extracted data.
TEXT = "This study examines the change in travel behaviour during the COVID-19 pandemic. \
We therefore examined your Google Semantic Location History data for January in 2019, \
2020, and 2021. To be precise, we extracted per month the total number of visited places, \
and the number of days spent per place for the three most visited places. Also, we extracted \
the number of days spent in places and travelling, and the travelled distance in km."


def __visit_duration(data):
    """Compute the time spent at each visited place.

    Args:
        data (dict): Google Semantic Location History data for one month,
            a dict with a "timelineObjects" list of placeVisit /
            activitySegment entries.
    Returns:
        dict: placeId -> total visit duration in days (rounded to 3
        decimals), sorted by duration in descending order.
    """
    # Accumulate the total duration per placeId in a single pass instead of
    # collecting one-entry dicts and re-scanning them per distinct place.
    durations = {}
    for data_unit in data["timelineObjects"]:
        visit = data_unit.get("placeVisit")
        if visit is None:
            continue
        place_id = visit["location"]["placeId"]
        start_ms = int(visit["duration"]["startTimestampMs"])
        end_ms = int(visit["duration"]["endTimestampMs"])
        # Timestamps are in milliseconds; convert the difference to days.
        durations[place_id] = (
            durations.get(place_id, 0.0) + (end_ms - start_ms) / (1e3 * 24 * 60 * 60))

    places = {place: round(days, 3) for place, days in durations.items()}
    # Sort places by amount of time spent, longest first.
    return dict(sorted(places.items(), key=lambda kv: kv[1], reverse=True))


def __activity_duration(data):
    """Compute the total time spent in activity segments.

    Args:
        data (dict): Google Semantic Location History data
    Returns:
        float: total duration of activities in days
    """
    # Millisecond timestamps -> days.
    msec_per_day = 1e3 * 24 * 60 * 60
    return sum(
        ((int(unit["activitySegment"]["duration"]["endTimestampMs"])
          - int(unit["activitySegment"]["duration"]["startTimestampMs"])) / msec_per_day
         for unit in data["timelineObjects"] if "activitySegment" in unit),
        0.0)


def __activity_distance(data):
    """Compute the total distance travelled during activity segments.

    Args:
        data (dict): Google Semantic Location History data
    Returns:
        float: total distance of activities in km
    """
    # "distance" is stored in metres; convert to kilometres.
    return sum(
        (int(unit["activitySegment"]["distance"]) / 1000.0
         for unit in data["timelineObjects"] if "activitySegment" in unit),
        0.0)


def process(file_data):
    """Extract a per-month travel summary from a Google Takeout zipfile.

    Args:
        file_data: path to a zip file, or a file-like object containing one
    Returns:
        dict: {"summary": explanatory text, "data_frames": [DataFrame]}
            with one DataFrame row per extracted (year, month)
    """
    results = []

    # The Takeout archive contains one JSON file per month, named like
    # "2019_JANUARY.json"; extract the selected years and months.
    with zipfile.ZipFile(file_data) as zfile:
        file_list = zfile.namelist()
        for year in YEARS:
            for month in MONTHS:
                monthfile = f"{year}_{month}.json"
                for name in file_list:
                    # Plain substring test: the previous regex search left
                    # "." unescaped, which matched any character.
                    if monthfile not in name:
                        continue
                    data = json.loads(zfile.read(name).decode("utf8"))
                    places = __visit_duration(data)
                    results.append({
                        "Year": year,
                        "Month": month,
                        "Top Places": dict(itertools.islice(places.items(), NPLACES)),
                        "Number of Places": len(places),
                        "Places Duration [days]": round(sum(places.values()), 3),
                        "Activity Duration [days]": round(__activity_duration(data), 3),
                        "Activity Distance [km]": round(__activity_distance(data), 3)
                    })
                    # Use only the first matching file for this month.
                    break

    # Put results in a DataFrame; "Top Places" expands to one column per place.
    data_frame = pd.json_normalize(results)

    # Anonymize by replacing placeId columns with numbered "Place N" columns.
    number = 0
    for column in data_frame.columns:
        if column.split(".")[0] == "Top Places":
            number += 1
            data_frame.rename(columns={column: f"Place {number} [days]"}, inplace=True)

    return {
        "summary": TEXT,
        "data_frames": [
            data_frame.fillna(0)
        ]
    }
14 changes: 14 additions & 0 deletions data_extractor/google_semantic_location_history/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"Main program to test google_semantic_location history script"
import io
import pandas as pd
from google_semantic_location_history import process


if __name__ == '__main__':
result = process("tests/data/Location History.zip")
print("\nRaw result:")
print(result)
data_frame = pd.read_csv(io.StringIO(result["data"]), sep=",")
pd.options.display.max_columns = 9
print("\nDataframe:")
print(data_frame)
Loading

0 comments on commit 3844e44

Please sign in to comment.