diff --git a/.gitignore b/.gitignore index 6915242..2378641 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,10 @@ # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. -.vscode/ *.pyc +.vscode/ + +__pycache__ + # dependencies /node_modules /.pnp diff --git a/dist/framework/processing/py_worker.js b/dist/framework/processing/py_worker.js index e8910ab..76a5cd5 100644 --- a/dist/framework/processing/py_worker.js +++ b/dist/framework/processing/py_worker.js @@ -82,5 +82,5 @@ function loadPackages() { } function installPortPackage() { console.log('[ProcessingWorker] load port package'); - return self.pyodide.runPythonAsync("\n import micropip\n await micropip.install(\"./port-0.0.0-py3-none-any.whl\", deps=False)\n import port\n "); + return self.pyodide.runPythonAsync("\n import micropip\n await micropip.install(\"../../port-0.0.0-py3-none-any.whl\", deps=False)\n import port\n "); } diff --git a/dist/port-0.0.0-py3-none-any.whl b/dist/port-0.0.0-py3-none-any.whl index 32885b6..02e55d5 100644 Binary files a/dist/port-0.0.0-py3-none-any.whl and b/dist/port-0.0.0-py3-none-any.whl differ diff --git a/public/port-0.0.0-py3-none-any.whl b/public/port-0.0.0-py3-none-any.whl index 32885b6..d889132 100644 Binary files a/public/port-0.0.0-py3-none-any.whl and b/public/port-0.0.0-py3-none-any.whl differ diff --git a/src/framework/processing/py/dist/port-0.0.0-py3-none-any.whl b/src/framework/processing/py/dist/port-0.0.0-py3-none-any.whl index 32885b6..4772943 100644 Binary files a/src/framework/processing/py/dist/port-0.0.0-py3-none-any.whl and b/src/framework/processing/py/dist/port-0.0.0-py3-none-any.whl differ diff --git a/src/framework/processing/py/poetry.lock b/src/framework/processing/py/poetry.lock index a182092..969dd9d 100644 --- a/src/framework/processing/py/poetry.lock +++ b/src/framework/processing/py/poetry.lock @@ -1,48 +1,173 @@ +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. + [[package]] name = "certifi" version = "2022.9.24" description = "Python package for providing Mozilla's CA Bundle." -category = "main" optional = false python-versions = ">=3.6" +files = [ + {file = "certifi-2022.9.24-py3-none-any.whl", hash = "sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382"}, + {file = "certifi-2022.9.24.tar.gz", hash = "sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14"}, +] [[package]] name = "charset-normalizer" version = "2.1.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "main" optional = false python-versions = ">=3.6.0" +files = [ + {file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"}, + {file = "charset_normalizer-2.1.1-py3-none-any.whl", hash = "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"}, +] [package.extras] -unicode_backport = ["unicodedata2"] +unicode-backport = ["unicodedata2"] + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.1.3" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.1.3-py3-none-any.whl", hash = "sha256:343280667a4585d195ca1cf9cef84a4e178c4b6cf2274caef9859782b567d5e3"}, + {file = "exceptiongroup-1.1.3.tar.gz", hash = "sha256:097acd85d473d75af5bb98e41b61ff7fe35efe6675e4f9370ec6ec5126d160e9"}, +] + +[package.extras] +test = ["pytest (>=6)"] [[package]] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" optional = false python-versions = ">=3.5" +files = [ + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, +] + +[[package]] +name = "importlib-metadata" +version = "6.7.0" +description = "Read metadata from Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "importlib_metadata-6.7.0-py3-none-any.whl", hash = "sha256:cb52082e659e97afc5dac71e79de97d8681de3aa07ff18578330904a9d18e5b5"}, + {file = "importlib_metadata-6.7.0.tar.gz", hash = "sha256:1aaf550d4f73e5d6783e7acb77aec43d49da8017410afae93822cc9cca98c4d4"}, +] + +[package.dependencies] +typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} +zipp = ">=0.5" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +perf = ["ipython"] +testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "packaging" +version = "23.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, + {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, +] [[package]] name = "panda" version = "0.3.1" description = "A Python implementation of the Panda REST interface" -category = "main" optional = false python-versions = "*" +files = [ + {file = "panda-0.3.1.tar.gz", hash = "sha256:f213b848f09268b3e9fce0e103155ab003217c0e27f6048b6194e7f90bb2b716"}, +] [package.dependencies] requests = "*" +setuptools = "*" + +[[package]] +name = "pluggy" +version = "1.2.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, + {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, +] + +[package.dependencies] +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pytest" +version = "7.4.2" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-7.4.2-py3-none-any.whl", hash = "sha256:1d881c6124e08ff0a1bb75ba3ec0bfd8b5354a01c194ddd5a0a870a48d99b002"}, + {file = "pytest-7.4.2.tar.gz", hash = "sha256:a766259cfab564a2ad52cb1aae1b881a75c3eb7e34ca3779697c23ed47c47069"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "requests" version = "2.28.1" description = "Python HTTP for Humans." -category = "main" optional = false python-versions = ">=3.7, <4" +files = [ + {file = "requests-2.28.1-py3-none-any.whl", hash = "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"}, + {file = "requests-2.28.1.tar.gz", hash = "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983"}, +] [package.dependencies] certifi = ">=2017.4.17" @@ -52,30 +177,78 @@ urllib3 = ">=1.21.1,<1.27" [package.extras] socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "setuptools" +version = "68.0.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "setuptools-68.0.0-py3-none-any.whl", hash = "sha256:11e52c67415a381d10d6b462ced9cfb97066179f0e871399e006c4ab101fc85f"}, + {file = "setuptools-68.0.0.tar.gz", hash = "sha256:baf1fdb41c6da4cd2eae722e135500da913332ab3f2f5c7d33af9b492acb5235"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[[package]] +name = "typing-extensions" +version = "4.7.1" +description = "Backported and Experimental Type Hints for Python 3.7+" +optional = false +python-versions = ">=3.7" +files = [ + {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, + {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, +] [[package]] name = "urllib3" version = "1.26.12" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" +files = [ + {file = "urllib3-1.26.12-py2.py3-none-any.whl", hash = "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"}, + {file = "urllib3-1.26.12.tar.gz", hash = "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e"}, +] [package.extras] -brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] -secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "urllib3-secure-extra", "ipaddress"] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] +[[package]] +name = "zipp" +version = "3.15.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +optional = false +python-versions = ">=3.7" +files = [ + {file = "zipp-3.15.0-py3-none-any.whl", hash = "sha256:48904fc76a60e542af151aded95726c1a5c34ed43ab4134b597665c86d7ad556"}, + {file = "zipp-3.15.0.tar.gz", hash = "sha256:112929ad649da941c23de50f356a2b5570c954b65150642bccdd66bf194d224b"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] + [metadata] -lock-version = "1.1" +lock-version = "2.0" python-versions = "^3.7" -content-hash = "1e6758195fd2eac25f04a0291b9cda25155f1805b98ca879f38c7f76c6153653" - -[metadata.files] -certifi = [] -charset-normalizer = [] -idna = [] -panda = [] -requests = [] -urllib3 = [] \ No newline at end of file +content-hash = "4e87005a84a3f69b738ac0b93da24a30b5203cdc8c63102f7eee3d96792822a9" diff --git a/src/framework/processing/py/port/__init__.py b/src/framework/processing/py/port/__init__.py index 9dd0e93..d067edc 100644 --- a/src/framework/processing/py/port/__init__.py +++ b/src/framework/processing/py/port/__init__.py @@ -1,5 +1,3 @@ from port.main import start -__all__ = [ - "start" -] +__all__ = ["start"] diff --git a/src/framework/processing/py/port/api/props.py b/src/framework/processing/py/port/api/props.py index db8aa78..734fc4a 100644 --- a/src/framework/processing/py/port/api/props.py +++ b/src/framework/processing/py/port/api/props.py @@ -11,13 +11,15 @@ class Translations(TypedDict): en: English string to display nl: Dutch string to display """ + en: str nl: str @dataclass class Translatable: - """Wrapper class for Translations""" + """Wrapper class for Translations""" + translations: Translations def toDict(self): @@ -31,6 +33,7 @@ class PropsUIHeader: Attributes: title: title of the page """ + title: Translatable def toDict(self): @@ -47,6 +50,7 @@ class PropsUIFooter: Attributes: progressPercentage: float indicating the progress in the flow """ + progressPercentage: float def toDict(self): @@ -60,14 +64,15 @@ def toDict(self): class PropsUIPromptConfirm: """Retry submitting a file page - Prompt the user if they want to submit a new file. - This can be used in case a file could not be processed. + Prompt the user if they want to submit a new file. + This can be used in case a file could not be processed. Attributes: text: message to display ok: message to display if the user wants to try again cancel: message to display if the user wants to continue regardless """ + text: Translatable ok: Translatable cancel: Translatable @@ -83,13 +88,14 @@ def toDict(self): @dataclass class PropsUIPromptConsentFormTable: - """Table to be shown to the participant prior to donation + """Table to be shown to the participant prior to donation Attributes: id: a unique string to itentify the table after donation title: title of the table data_frame: table to be shown """ + id: str title: Translatable data_frame: pd.DataFrame @@ -102,14 +108,16 @@ def toDict(self): dict["data_frame"] = self.data_frame.to_json() return dict + @dataclass class PropsUIPromptConsentForm: - """Tables to be shown to the participant prior to donation + """Tables to be shown to the participant prior to donation Attributes: tables: a list of tables meta_tables: a list of optional tables, for example for logging data """ + tables: list[PropsUIPromptConsentFormTable] meta_tables: list[PropsUIPromptConsentFormTable] @@ -141,6 +149,7 @@ class PropsUIPromptFileInput: description: text with an explanation extensions: accepted mime types, example: "application/zip, text/plain" """ + description: Translatable extensions: str @@ -159,6 +168,7 @@ class RadioItem(TypedDict): id: id of radio button value: text to be displayed """ + id: int value: str @@ -174,6 +184,7 @@ class PropsUIPromptRadioInput: description: short description of the radio group items: a list of radio buttons """ + title: Translatable description: Translatable items: list[RadioItem] @@ -197,6 +208,7 @@ class PropsUIPageDonation: body: main body of the page, see the individual classes for an explanation footer: page footer """ + platform: str header: PropsUIHeader body: PropsUIPromptRadioInput | PropsUIPromptConsentForm | PropsUIPromptFileInput | PropsUIPromptConfirm @@ -214,6 +226,7 @@ def toDict(self): class PropsUIPageEnd: """An ending page to show the user they are done""" + def toDict(self): dict = {} dict["__type__"] = "PropsUIPageEnd" diff --git a/src/framework/processing/py/port/script.py b/src/framework/processing/py/port/script.py index c8fa219..684e3a3 100644 --- a/src/framework/processing/py/port/script.py +++ b/src/framework/processing/py/port/script.py @@ -1,21 +1,29 @@ import itertools import port.api.props as props -from port.api.commands import (CommandSystemDonate, CommandUIRender) +from port.api.commands import CommandSystemDonate, CommandUIRender import pandas as pd import zipfile import json import datetime -from collections import defaultdict +import fnmatch +from collections import defaultdict, namedtuple +from contextlib import suppress + ########################## # TikTok file processing # ########################## -filter_start = datetime.datetime(2021, 1, 1) +filter_start = datetime.datetime(1990, 1, 1) filter_end = datetime.datetime(2025, 1, 1) datetime_format = "%Y-%m-%d %H:%M:%S" + +def parse_datetime(value): + return datetime.datetime.fromtimestamp(value) + + def get_in(data_dict, *key_path): for k in key_path: data_dict = data_dict.get(k, None) @@ -23,21 +31,52 @@ def get_in(data_dict, *key_path): return None return data_dict -def get_video_list_data(data): - return get_in(data, "Activity", "Video Browsing History", "VideoList") + +def get_list(data_dict, *key_path): + result = get_in(data_dict, *key_path) + if result is None: + return [] + return result + + +def get_dict(data_dict, *key_path): + result = get_in(data_dict, *key_path) + if result is None: + return {} + return result + + +def get_string(data_dict, *key_path): + result = get_in(data_dict, *key_path) + if result is None: + return "" + return result + + +def cast_number(data_dict, *key_path): + value = get_in(data_dict, *key_path) + if value is None or value == "None": + return 0 + return value + + +def get_activity_video_browsing_list_data(data): + return get_list(data, "Activity", "Video Browsing History", "VideoList") + def get_comment_list_data(data): return get_in(data, "Comment", "Comments", "CommentsList") -def get_date_filtered_items(items): - for item in items: - timestamp =datetime.datetime.strptime(item["Date"], datetime_format) + +def filter_timestamps(timestamps): + for timestamp in timestamps: if timestamp < filter_start or timestamp > filter_end: continue - yield (timestamp, item) + yield timestamp + def get_count_by_date_key(timestamps, key_func): - """ Returns a list of tuples of the form (key, count) + """Returns a dict of the form (key, count) The key is determined by the key_func, which takes a datetime object and returns an object suitable for sorting and usage as a dictionary key. @@ -49,17 +88,35 @@ def get_count_by_date_key(timestamps, key_func): item_count[key_func(timestamp)] += 1 return sorted(item_count.items()) + def get_all_first(items): return (i[0] for i in items) + def hourly_key(date): - return date.strftime("%Y-%m-%d %H" ) + return date.replace(minute=0, second=0, microsecond=0) + def daily_key(date): - return date.strftime("%Y-%m-%d") + return date.date() + + +# ===================== +def glob(zipfile, pattern): + return fnmatch.filter(zipfile.namelist(), pattern) + + +def glob_json(zipfile, pattern): + for name in glob(zipfile, pattern): + with zipfile.open(name) as f: + yield json.load(f) + + +# ===================== + def get_sessions(timestamps): - """ Returns a list of tuples of the form (start, end, duration) + """Returns a list of tuples of the form (start, end, duration) The start and end are datetime objects, and the duration is a timedelta object. @@ -74,147 +131,485 @@ def get_sessions(timestamps): start = timestamps[0] end = timestamps[0] for prev, cur in zip(timestamps, timestamps[1:]): - if cur - prev > datetime.timedelta(hours=1): - sessions.append((start, end, end-start)) + if cur - prev > datetime.timedelta(minutes=5): + sessions.append((start, end, end - start)) start = cur end = cur - sessions.append((start, end, end-start)) + sessions.append((start, end, end - start)) return sessions -def get_json_data(zip_file): - with zipfile.ZipFile(zip_file, "r") as zip: - for name in zip.namelist(): - if not name.endswith(".json"): - continue - with zip.open(name) as json_file: - yield json.load(json_file) +def filtered_count(data, *key_path): + items = get_list(data, *key_path) + filtered_items = get_date_filtered_items(items) + return len(list(filtered_items)) -def extract_tiktok_data(zip_file): - for data in get_json_data(zip_file): - videos = list(get_all_first(get_date_filtered_items(get_video_list_data(data)))) - video_counts= get_count_by_date_key(videos, hourly_key) - table_title = props.Translatable({ - "en": "TikTok video browsing history", - "nl": "TikTok video geschiedenis" - }) - print(video_counts) - data_frame = pd.DataFrame(video_counts, columns=["Hour", "View Count"]) - return [props.PropsUIPromptConsentFormTable("tiktok_video_counts", table_title, data_frame)] +def get_chat_history(data): + return get_dict(data, "Direct Messages", "Chat History", "ChatHistory") - # comment_list_dates = list(get_all_first(get_date_filtered_items(get_comment_list_data(data)))) - # sessions = get_sessions(itertools.chain(video_dates, comment_list_dates)) - # yield sessions -# data = json.load(open(sys.argv[1])) +def flatten_chat_history(history): + return itertools.chain(*history.values()) -# from pprint import pprint -# video_dates = list(get_all_first(get_date_filtered_items(get_video_list_data(data)))) -# pprint(get_count_by_date_key(video_dates, hourly_key)) -# pprint(get_count_by_date_key(video_dates, daily_key)) -# print("#"*80) -# comment_list_dates = list(get_all_first(get_date_filtered_items(get_comment_list_data(data)))) -# pprint(get_count_by_date_key(comment_list_dates, hourly_key)) -# pprint(get_count_by_date_key(comment_list_dates, daily_key)) -# sessions = get_sessions(itertools.chain(video_dates, comment_list_dates)) -# pprint(sessions) +def filter_by_key(items, key, value): + return filter(lambda item: item[key] == value, items) + + +def exclude_by_key(items, key, value): + """ + Return a filtered list where items that match key & value are excluded. + """ + return filter(lambda item: item[key] != value, items) + + +def map_to_timeslot(series): + return series.map(lambda hour: f"{hour}-{hour+1}") + + +def count_items(zipfile, pattern, key): + return sum(len(data[key]) for data in glob_json(zipfile, pattern)) + + +def count_posts(zipfile): + return sum(len(data) for data in glob_json(zipfile, "content/posts_*.json")) + + +def count_messages(zipfile): + counts = {"sent": 0, "received": 0} + for data in glob_json(zipfile, "messages/inbox/**/message_*.json"): + donating_user = data["participants"][1]["name"] + for message in data["messages"]: + key = "sent" if message["sender_name"] == donating_user else "received" + counts[key] += 1 + return counts + + +def extract_summary_data(zipfile): + message_counts = count_messages(zipfile) + summary_data = { + "Description": [ + "Followers", + "Following", + "Posts", + "Comments posted", + "Videos watched", + "Posts viewed", + "Messages sent", + "Messages received", + "Ads viewed", + ], + "Number": [ + count_items( + zipfile, "followers_and_following/followers_*.json", "string_list_data" + ), + count_items( + zipfile, + "followers_and_following/following.json", + "relationships_following", + ), + count_posts(zipfile), + count_items( + zipfile, "comments/post_comments.json", "comments_media_comments" + ), + count_items( + zipfile, + "ads_and_topics/videos_watched.json", + "impressions_history_videos_watched", + ), + count_items( + zipfile, + "ads_and_topics/posts_viewed.json", + "impressions_history_posts_seen", + ), + message_counts["sent"], + message_counts["received"], + count_items( + zipfile, + "ads_and_topics/ads_viewed.json", + "impressions_history_ads_seen", + ), + ], + } + + return ExtractionResult( + "instagram_summary", + props.Translatable( + {"en": "Summary information", "nl": "Samenvatting gegevens"} + ), + pd.DataFrame(summary_data), + ) + + +def extract_direct_message_activity(zipfile): + counter = itertools.count() + person_ids = defaultdict(lambda: next(counter)) + sender_ids = [] + timestamps = [] + for data in glob_json(zipfile, "messages/inbox/**/message_*.json"): + # Ensure the donating user is the first to get an ID + donating_user = data["participants"][1]["name"] + person_ids[donating_user] + for message in data["messages"]: + sender_ids.append(person_ids[message["sender_name"]]) + timestamps.append(parse_datetime(message["timestamp_ms"] / 1000)) + df = pd.DataFrame({"Anonymous ID": sender_ids, "Sent": timestamps}) + df["Sent"] = df["Sent"].dt.strftime("%Y-%m-%d %H:%M") + return ExtractionResult( + "instagram_direct_message_activity", + props.Translatable( + {"en": "Direct message activity", "nl": "Bericht activiteit"} + ), + df, + ) + + +def extract_comment_activity(zipfile): + timestamps = [] + for data in glob_json(zipfile, "comments/post_comments.json"): + for item in data["comments_media_comments"]: + timestamps.append( + parse_datetime(item["string_map_data"]["Time"]["timestamp"]) + ) + df = pd.DataFrame({"Posted": timestamps}) + df = df.sort_values("Posted") + df["Posted"] = df["Posted"].dt.strftime("%Y-%m-%d %H:%M") + return ExtractionResult( + "instagram_comment_activity", + props.Translatable({"en": "Comment activity", "nl": "Commentaar activiteit"}), + df, + ) + + +def extract_posts_liked(zipfile): + urls = [] + timestamps = [] + for data in glob_json(zipfile, "likes/liked_posts.json"): + for item in data["likes_media_likes"]: + info = item["string_list_data"][0] + timestamps.append(parse_datetime(info["timestamp"])) + urls.append(info["href"]) + df = pd.DataFrame({"Liked": timestamps, "Link": urls}) + df["Liked"] = df["Liked"].dt.strftime("%Y-%m-%d %H:%M") + df = df.sort_values("Liked") + return ExtractionResult( + "instagram_posts_liked", + props.Translatable({"en": "Posts Liked", "nl": "Geliked"}), + df, + ) + + +def flatten_media(items): + for item in items: + yield from item["media"] + + +def get_creation_timestamps(items): + for item in items: + yield parse_datetime(item["creation_timestamp"]) + + +def get_media_creation_timestamps(items): + return get_creation_timestamps(flatten_media(items)) + + +def get_content_posts_timestamps(zipfile): + for data in glob_json(zipfile, "content/posts_*.json"): + yield from get_media_creation_timestamps(data) + + +def get_media_timestamps(zipfile, pattern, key): + for data in glob_json(zipfile, pattern): + yield from get_media_creation_timestamps(data[key]) + + +def df_from_timestamps(timestamps, column): + df = pd.DataFrame({"timestamps": timestamps}) + counts = df.groupby(lambda x: hourly_key(df["timestamps"][x])).size() + + df = counts.reset_index() + df.columns = ["timestamp", column] + return df + + +def stories_timestamps(zipfile): + for data in glob_json(zipfile, "content/stories.json"): + for item in data["ig_stories"]: + yield parse_datetime(item["creation_timestamp"]) + + +def df_from_timestamp_columns(a, b): + data_frames = [ + df_from_timestamps(timestamps, column) for timestamps, column in [a, b] + ] + + df = pd.merge( + data_frames[0], + data_frames[1], + left_on="timestamp", + right_on="timestamp", + how="outer", + ).sort_index() + df["Date"] = df["timestamp"].dt.strftime("%Y-%m-%d") + df["Timeslot"] = map_to_timeslot(df["timestamp"].dt.hour) + df = df.reset_index(drop=True) + df = ( + df.reindex(columns=["Date", "Timeslot", a[1], b[1]]) + .reset_index(drop=True) + .fillna(0) + ) + df[a[1]] = df[a[1]].astype(int) + df[b[1]] = df[b[1]].astype(int) + return df + + +def get_video_posts_timestamps(zipfile): + return itertools.chain( + get_content_posts_timestamps(zipfile), + get_media_timestamps(zipfile, "content/igtv_videos.json", "ig_igtv_media"), + get_media_timestamps(zipfile, "content/reels.json", "ig_reels_media"), + ) + + +def extract_video_posts(zipfile): + video_timestamps = get_video_posts_timestamps(zipfile) + df = df_from_timestamp_columns( + (video_timestamps, "Videos"), (stories_timestamps(zipfile), "Stories") + ) + return ExtractionResult( + "instagram_video_posts", + props.Translatable({"en": "Posts", "nl": "Posts"}), + df, + ) + + +def get_post_comments_timestamps(zipfile): + return get_string_map_timestamps( + zipfile, "comments/post_comments.json", "comments_media_comments" + ) + + +def get_string_list_timestamps(zipfile, pattern, key): + for data in glob_json(zipfile, pattern): + for item in data[key]: + yield parse_datetime(item["string_list_data"][0]["timestamp"]) + + +def get_string_map_timestamps(zipfile, pattern, key): + for data in glob_json(zipfile, pattern): + for item in data[key]: + yield parse_datetime(item["string_map_data"]["Time"]["timestamp"]) + + +def get_likes_timestamps(zipfile): + return itertools.chain( + get_string_list_timestamps( + zipfile, "likes/liked_comments.json", "likes_comment_likes" + ), + get_string_list_timestamps( + zipfile, "likes/liked_posts.json", "likes_media_likes" + ), + ) + + +def extract_comments_and_likes(zipfile): + comment_timestamps = get_post_comments_timestamps(zipfile) + likes_timestamps = get_likes_timestamps(zipfile) + df = df_from_timestamp_columns( + (comment_timestamps, "Comments"), (likes_timestamps, "Likes") + ) + return ExtractionResult( + "instagram_comments_and_likes", + props.Translatable({"en": "Comments and likes", "nl": "Comments en likes"}), + df, + ) + + +def extract_viewed(zipfile): + df = df_from_timestamp_columns( + ( + get_string_map_timestamps( + zipfile, + "ads_and_topics/videos_watched.json", + "impressions_history_videos_watched", + ), + "Videos", + ), + ( + get_string_map_timestamps( + zipfile, + "ads_and_topics/posts_viewed.json", + "impressions_history_posts_seen", + ), + "Posts", + ), + ) + return ExtractionResult( + "instagram_viewed", + props.Translatable({"en": "Viewed", "nl": "Viewed"}), + df, + ) + + +def extract_session_info(zipfile): + timestamps = list( + itertools.chain( + list(get_video_posts_timestamps(zipfile)), + list(stories_timestamps(zipfile)), + list(get_post_comments_timestamps(zipfile)), + list(get_likes_timestamps(zipfile)), + ) + ) + print(timestamps) + sessions = get_sessions(timestamps) + print(sessions) + df = pd.DataFrame(sessions, columns=["Start", "End", "Duration"]) + df["Start"] = df["Start"].dt.strftime("%Y-%m-%d %H:%M") + df["Duration (in minutes)"] = (df["Duration"].dt.total_seconds() / 60).round(2) + df = df.drop("End", axis=1) + df = df.drop("Duration", axis=1) + + return ExtractionResult( + "instagram_session_info", + props.Translatable({"en": "Session information", "nl": "Sessie informatie"}), + df, + ) + + +def extract_data(path): + extractors = [ + extract_summary_data, + extract_video_posts, + extract_comments_and_likes, + extract_viewed, + extract_session_info, + extract_direct_message_activity, + extract_comment_activity, + extract_posts_liked, + ] + zfile = zipfile.ZipFile(path) + print(zfile.namelist()) + return [extractor(zfile) for extractor in extractors] ###################### # Data donation flow # ###################### -def process_tiktok(sessionId): - progress = 0 - platform = "TikTok" - meta_data = [] - data = None - while True: - promptFile = prompt_file(platform, "application/zip, text/plain") - fileResult = yield render_donation_page(platform, promptFile, progress) - if fileResult.__type__ != 'PayloadString': - meta_data.append(("debug", f"{platform}: skip to next step")) - break - - meta_data.append(("debug", f"{platform}: extracting file")) - extractionResult = extract_tiktok_data(fileResult.value) - if extractionResult != 'invalid': - meta_data.append(("debug", f"{platform}: extraction successful, go to consent form")) - data = extractionResult - break - - meta_data.append(("debug", f"{platform}: prompt confirmation to retry file selection")) - retry_result = yield render_donation_page(platform, retry_confirmation(platform), progress) - if retry_result.__type__ == 'PayloadTrue': - meta_data.append(("debug", f"{platform}: skip due to invalid file")) - continue - meta_data.append(("debug", f"{platform}: retry prompt file")) - break - if data: - meta_data.append(("debug", f"{platform}: prompt consent")) - consent_result = yield render_donation_page(platform, props.PropsUIPromptConsentForm(data, []), progress) +ExtractionResult = namedtuple("ExtractionResult", ["id", "title", "data_frame"]) + + +class SkipToNextStep(Exception): + pass + + +class DataDonationProcessor: + def __init__(self, platform, mime_types, extractor, session_id): + self.platform = platform + self.mime_types = mime_types + self.extractor = extractor + self.session_id = session_id + self.progress = 0 + self.meta_data = [] + + def process(self): + print("START") + with suppress(SkipToNextStep): + while True: + file_result = yield from self.prompt_file() + + self.log(f"extracting file") + try: + print(file_result) + extraction_result = self.extract_data(file_result.value) + except IOError as e: + print("IOERROR") + self.log(f"prompt confirmation to retry file selection") + yield from self.prompt_retry() + return + else: + if extraction_result is None: + try_again = yield from self.prompt_retry() + if try_again: + continue + else: + return + self.log(f"extraction successful, go to consent form") + yield from self.prompt_consent(extraction_result) + + def prompt_retry(self): + retry_result = yield render_donation_page( + self.platform, retry_confirmation(self.platform), self.progress + ) + return retry_result.__type__ == "PayloadTrue" + + def prompt_file(self): + description = props.Translatable( + { + "en": f"Please follow the download instructions and choose the file that you stored on your device. Click “Skip” at the right bottom, if you do not have a {self.platform} file. ", + "nl": f"Volg de download instructies en kies het bestand dat u opgeslagen heeft op uw apparaat. Als u geen {self.platform} bestand heeft klik dan op “Overslaan” rechts onder.", + } + ) + prompt_file = props.PropsUIPromptFileInput(description, self.mime_types) + file_result = yield render_donation_page( + self.platform, prompt_file, self.progress + ) + if file_result.__type__ != "PayloadString": + self.log(f"skip to next step") + raise SkipToNextStep() + return file_result + + def log(self, message): + self.meta_data.append(("debug", f"{self.platform}: {message}")) + + def extract_data(self, file): + return self.extractor(file) + + def prompt_consent(self, data): + log_title = props.Translatable({"en": "Log messages", "nl": "Log berichten"}) + + tables = [ + props.PropsUIPromptConsentFormTable(table.id, table.title, table.data_frame) + for table in data + ] + meta_frame = pd.DataFrame(self.meta_data, columns=["type", "message"]) + meta_table = props.PropsUIPromptConsentFormTable( + "log_messages", log_title, meta_frame + ) + self.log(f"prompt consent") + consent_result = yield render_donation_page( + self.platform, + props.PropsUIPromptConsentForm(tables, [meta_table]), + self.progress, + ) if consent_result.__type__ == "PayloadJSON": - meta_data.append(("debug", f"{platform}: donate consent data")) - yield donate(f"{sessionId}-{platform}", consent_result.value) + self.log(f"donate consent data") + yield donate(f"{self.sessionId}-{self.platform}", consent_result.value) -def process(sessionId): - progress = 0 - yield donate(f"{sessionId}-tracking", '[{ "message": "user entered script" }]') - yield from process_tiktok(sessionId) - - # subflows = len(platforms) - # steps = 2 - # step_percentage = (100/subflows)/steps - - # # progress in % - # progress = 0 - - # for index, platform in enumerate(platforms): - # meta_data = [] - # meta_data.append(("debug", f"{platform}: start")) - - # # STEP 1: select the file - # progress += step_percentage - # data = None - # while True: - # meta_data.append(("debug", f"{platform}: prompt file")) - # promptFile = prompt_file(platform, "application/zip, text/plain") - # fileResult = yield render_donation_page(platform, promptFile, progress) - # if fileResult.__type__ == 'PayloadString': - # meta_data.append(("debug", f"{platform}: extracting file")) - # extractionResult = doSomethingWithTheFile(platform, fileResult.value) - # if extractionResult != 'invalid': - # meta_data.append(("debug", f"{platform}: extraction successful, go to consent form")) - # data = extractionResult - # break - # else: - # meta_data.append(("debug", f"{platform}: prompt confirmation to retry file selection")) - # retry_result = yield render_donation_page(platform, retry_confirmation(platform), progress) - # if retry_result.__type__ == 'PayloadTrue': - # meta_data.append(("debug", f"{platform}: skip due to invalid file")) - # continue - # else: - # meta_data.append(("debug", f"{platform}: retry prompt file")) - # break - # else: - # meta_data.append(("debug", f"{platform}: skip to next step")) - # break - - # # STEP 2: ask for consent - # progress += step_percentage - # if data is not None: - # meta_data.append(("debug", f"{platform}: prompt consent")) - # prompt = prompt_consent(platform, data, meta_data) - # consent_result = yield render_donation_page(platform, prompt, progress) - # if consent_result.__type__ == "PayloadJSON": - # meta_data.append(("debug", f"{platform}: donate consent data")) - # yield donate(f"{sessionId}-{platform}", consent_result.value) +class DataDonation: + def __init__(self, platform, mime_types, extractor): + self.platform = platform + self.mime_types = mime_types + self.extractor = extractor + + def __call__(self, session_id): + processor = DataDonationProcessor( + self.platform, self.mime_types, self.extractor, session_id + ) + yield from processor.process() + +data_donation = DataDonation("Instagram", "application/zip", extract_data) + + +def process(session_id): + progress = 0 + yield donate(f"{session_id}-tracking", '[{ "message": "user entered script" }]') + yield from data_donation(session_id) yield render_end_page() @@ -224,10 +619,7 @@ def render_end_page(): def render_donation_page(platform, body, progress): - header = props.PropsUIHeader(props.Translatable({ - "en": platform, - "nl": platform - })) + header = props.PropsUIHeader(props.Translatable({"en": platform, "nl": platform})) footer = props.PropsUIFooter(progress) page = props.PropsUIPageDonation(platform, header, body, footer) @@ -235,66 +627,41 @@ def render_donation_page(platform, body, progress): def retry_confirmation(platform): - text = props.Translatable({ - "en": f"Unfortunately, we cannot process your {platform} file. Continue, if you are sure that you selected the right file. Try again to select a different file.", - "nl": f"Helaas, kunnen we uw {platform} bestand niet verwerken. Weet u zeker dat u het juiste bestand heeft gekozen? Ga dan verder. Probeer opnieuw als u een ander bestand wilt kiezen." - }) - ok = props.Translatable({ - "en": "Try again", - "nl": "Probeer opnieuw" - }) - cancel = props.Translatable({ - "en": "Continue", - "nl": "Verder" - }) + text = props.Translatable( + { + "en": f"Unfortunately, we cannot process your {platform} file. Continue, if you are sure that you selected the right file. Try again to select a different file.", + "nl": f"Helaas, kunnen we uw {platform} bestand niet verwerken. Weet u zeker dat u het juiste bestand heeft gekozen? Ga dan verder. Probeer opnieuw als u een ander bestand wilt kiezen.", + } + ) + ok = props.Translatable({"en": "Try again", "nl": "Probeer opnieuw"}) + cancel = props.Translatable({"en": "Continue", "nl": "Verder"}) return props.PropsUIPromptConfirm(text, ok, cancel) -def prompt_file(platform, extensions): - description = props.Translatable({ - "en": f"Please follow the download instructions and choose the file that you stored on your device. Click “Skip” at the right bottom, if you do not have a {platform} file. ", - "nl": f"Volg de download instructies en kies het bestand dat u opgeslagen heeft op uw apparaat. Als u geen {platform} bestand heeft klik dan op “Overslaan” rechts onder." - }) - - return props.PropsUIPromptFileInput(description, extensions) - - -def doSomethingWithTheFile(platform, filename): - return extract_zip_contents(filename) - - -def extract_zip_contents(filename): - names = [] - try: - file = zipfile.ZipFile(filename) - data = [] - for name in file.namelist(): - names.append(name) - info = file.getinfo(name) - data.append((name, info.compress_size, info.file_size)) - return data - except zipfile.error: - return "invalid" - - def prompt_consent(id, data, meta_data): + table_title = props.Translatable( + {"en": "Zip file contents", "nl": "Inhoud zip bestand"} + ) - table_title = props.Translatable({ - "en": "Zip file contents", - "nl": "Inhoud zip bestand" - }) - - log_title = props.Translatable({ - "en": "Log messages", - "nl": "Log berichten" - }) + log_title = props.Translatable({"en": "Log messages", "nl": "Log berichten"}) data_frame = pd.DataFrame(data, columns=["filename", "compressed size", "size"]) table = props.PropsUIPromptConsentFormTable("zip_content", table_title, data_frame) meta_frame = pd.DataFrame(meta_data, columns=["type", "message"]) - meta_table = props.PropsUIPromptConsentFormTable("log_messages", log_title, meta_frame) + meta_table = props.PropsUIPromptConsentFormTable( + "log_messages", log_title, meta_frame + ) return props.PropsUIPromptConsentForm([table], [meta_table]) def donate(key, json_string): return CommandSystemDonate(key, json_string) + + +if __name__ == "__main__": + import sys + + if len(sys.argv) > 1: + print(extract_data(sys.argv[1])) + else: + print("please provide a zip file as argument") diff --git a/src/framework/processing/py/pyproject.toml b/src/framework/processing/py/pyproject.toml index 22c9375..71cd9e5 100644 --- a/src/framework/processing/py/pyproject.toml +++ b/src/framework/processing/py/pyproject.toml @@ -8,7 +8,8 @@ authors = ["Emiel van der Veen "] python = "^3.7" panda = "^0.3.1" -[tool.poetry.dev-dependencies] +[tool.poetry.group.test.dependencies] +pytest = "^7.4.2" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/src/framework/processing/py/tests/__init__.py b/src/framework/processing/py/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/framework/processing/py/tests/script_test.py b/src/framework/processing/py/tests/script_test.py new file mode 100644 index 0000000..35612f8 --- /dev/null +++ b/src/framework/processing/py/tests/script_test.py @@ -0,0 +1,351 @@ +""" +- test file type error +- test data not found error +- grouping by hour + - multiple columns + + +""" + +import json +import io +from pathlib import Path +from dataclasses import dataclass +from inspect import cleandoc +import pandas as pd +from pandas.testing import assert_frame_equal +from port.api import commands +from port import script + + +class FakeZip: + def __init__(self, files): + self._files = files + + def namelist(self): + return self._files.keys() + + def open(self, name): + data = self._files[name] + f = io.StringIO() + json.dump(data, f) + f.seek(0) + return f + + +def assert_frame_str_equal(df1, df2): + assert cleandoc(df1) == str(df2) + + +def test_summary_table(): + data = FakeZip( + { + "followers_and_following/followers_1.json": {"string_list_data": [{}, {}]}, + "followers_and_following/followers_2.json": {"string_list_data": [{}]}, + "followers_and_following/following.json": { + "relationships_following": [ + {}, + {}, + {}, + {}, + ] + }, + "content/posts_1.json": [ + {}, + {}, + {}, + {}, + {}, + ], + "comments/post_comments.json": {"comments_media_comments": [{}]}, + "ads_and_topics/videos_watched.json": { + "impressions_history_videos_watched": [{}, {}] + }, + "ads_and_topics/posts_viewed.json": { + "impressions_history_posts_seen": [{}, {}] + }, + "ads_and_topics/ads_viewed.json": { + "impressions_history_ads_seen": [{}, {}] + }, + "messages/inbox/some_person/message_1.json": { + "participants": [{"name": "Some"}, {"name": "Me"}], + "messages": [ + {"sender_name": "Me"}, + {"sender_name": "Some"}, + {"sender_name": "Me"}, + ], + }, + } + ) + result = script.extract_summary_data(data) + assert "instagram_summary" == result.id + assert "Summary information" == result.title.translations["en"] + + reference = """ + Description Number + 0 Followers 3 + 1 Following 4 + 2 Posts 5 + 3 Comments posted 1 + 4 Videos watched 2 + 5 Posts viewed 2 + 6 Messages sent 2 + 7 Messages received 1 + 8 Ads viewed 2 + """ + print(result.data_frame) + assert_frame_str_equal(reference, result.data_frame) + + +video_posts = { + "content/posts_1.json": [ + {"media": [{"creation_timestamp": 1678743234}]}, + {"media": [{"creation_timestamp": 1678752349}]}, + ], + "content/igtv_videos.json": { + "ig_igtv_media": [ + {"media": [{"creation_timestamp": 1678743235}]}, + {"media": [{"creation_timestamp": 1678752319}]}, + {"media": [{"creation_timestamp": 1678769988}]}, + ] + }, + "content/reels.json": { + "ig_reels_media": [ + {"media": [{"creation_timestamp": 1678752377}]}, + {"media": [{"creation_timestamp": 1678793248}]}, + ] + }, + "content/stories.json": { + "ig_stories": [ + {"creation_timestamp": 1678743234}, + ] + }, +} + + +def test_video_posts_table(): + data = FakeZip(video_posts) + result = script.extract_video_posts(data) + assert "instagram_video_posts" == result.id + assert "Posts" == result.title.translations["en"] + + reference = """ + Date Timeslot Videos Stories + 0 2023-03-13 22-23 2 1 + 1 2023-03-14 1-2 3 0 + 2 2023-03-14 5-6 1 0 + 3 2023-03-14 12-13 1 0 + """ + print(result.data_frame) + assert_frame_str_equal(reference, result.data_frame) + + +comments_data = { + "comments/post_comments.json": { + "comments_media_comments": [ + {"string_map_data": {"Time": {"timestamp": 1678743234}}}, + {"string_map_data": {"Time": {"timestamp": 1678752349}}}, + ] + }, + "likes/liked_comments.json": { + "likes_comment_likes": [{"string_list_data": [{"timestamp": 1678743446}]}], + }, + "likes/liked_posts.json": { + "likes_media_likes": [{"string_list_data": [{"timestamp": 1678743446}]}] + }, +} + + +def test_comments_and_likes_table(): + data = FakeZip(comments_data) + result = script.extract_comments_and_likes(data) + assert "instagram_comments_and_likes" == result.id + assert "Comments and likes" == result.title.translations["en"] + + reference = """ + Date Timeslot Comments Likes + 0 2023-03-13 22-23 1 2 + 1 2023-03-14 1-2 1 0 + """ + print(result.data_frame) + assert_frame_str_equal(reference, result.data_frame) + + +def test_viewed_table(): + data = FakeZip( + { + "ads_and_topics/videos_watched.json": { + "impressions_history_videos_watched": [ + {"string_map_data": {"Time": {"timestamp": 1678741258}}}, + {"string_map_data": {"Time": {"timestamp": 1678741258}}}, + ] + }, + "ads_and_topics/posts_viewed.json": { + "impressions_history_posts_seen": [ + {"string_map_data": {"Time": {"timestamp": 1678741258}}}, + {"string_map_data": {"Time": {"timestamp": 1678798788}}}, + ] + }, + }, + ) + result = script.extract_viewed(data) + assert "instagram_viewed" == result.id + assert "Viewed" == result.title.translations["en"] + + reference = """ + Date Timeslot Videos Posts + 0 2023-03-13 22-23 2 1 + 1 2023-03-14 13-14 0 1 + """ + print(result.data_frame) + assert_frame_str_equal(reference, result.data_frame) + + +def test_session_info_table(): + data = FakeZip({**video_posts, **comments_data}) + result = script.extract_session_info(data) + assert "instagram_session_info" == result.id + assert "Session information" == result.title.translations["en"] + + reference = """ + Start Duration (in minutes) + 0 2023-03-13 22:33 3.53 + 1 2023-03-14 01:05 0.97 + 2 2023-03-14 05:59 0.00 + 3 2023-03-14 12:27 0.00 + """ + print(result.data_frame) + assert_frame_str_equal(reference, result.data_frame) + + +def test_direct_message_activity_table(): + data = FakeZip( + { + "messages/inbox/some_person/message_1.json": { + "participants": [{"name": "Some"}, {"name": "Me"}], + "messages": [ + { + "sender_name": "Me", + "timestamp_ms": 1677493123321, + }, + { + "sender_name": "Some", + "timestamp_ms": 1677493127655, + }, + { + "sender_name": "Me", + "timestamp_ms": 1677493187671, + }, + ], + }, + "messages/inbox/some_other/message_1.json": { + "participants": [{"name": "Other"}, {"name": "Me"}], + "messages": [ + { + "sender_name": "Other", + "timestamp_ms": 1677493295441, + }, + { + "sender_name": "Me", + "timestamp_ms": 1677493299999, + }, + { + "sender_name": "Other", + "timestamp_ms": 1677493299999, + }, + ], + }, + }, + ) + result = script.extract_direct_message_activity(data) + assert "instagram_direct_message_activity" == result.id + assert "Direct message activity" == result.title.translations["en"] + + reference = """ + Anonymous ID Sent + 0 0 2023-02-27 11:18 + 1 1 2023-02-27 11:18 + 2 0 2023-02-27 11:19 + 3 2 2023-02-27 11:21 + 4 0 2023-02-27 11:21 + 5 2 2023-02-27 11:21 + """ + print(result.data_frame) + assert_frame_str_equal(reference, result.data_frame) + + +def test_comment_activity_table(): + data = FakeZip( + { + "comments/post_comments.json": { + "comments_media_comments": [ + {"string_map_data": {"Time": {"timestamp": 1678743434}}}, + {"string_map_data": {"Time": {"timestamp": 1678743478}}}, + {"string_map_data": {"Time": {"timestamp": 1678747777}}}, + {"string_map_data": {"Time": {"timestamp": 1678749999}}}, + {"string_map_data": {"Time": {"timestamp": 1678999999}}}, + ] + }, + }, + ) + result = script.extract_comment_activity(data) + assert "instagram_comment_activity" == result.id + assert "Comment activity" == result.title.translations["en"] + + reference = """ + Posted + 0 2023-03-13 22:37 + 1 2023-03-13 22:37 + 2 2023-03-13 23:49 + 3 2023-03-14 00:26 + 4 2023-03-16 21:53 + """ + print(result.data_frame) + assert_frame_str_equal(reference, result.data_frame) + + +def test_posts_liked_table(): + data = FakeZip( + { + "likes/liked_posts.json": { + "likes_media_likes": [ + { + "string_list_data": [ + { + "timestamp": 1678743446, + "href": "https://example.org/test1", + } + ] + }, + { + "string_list_data": [ + { + "timestamp": 1678743467, + "href": "https://example.org/test2", + } + ] + }, + { + "string_list_data": [ + { + "timestamp": 1678747777, + "href": "https://example.org/test3", + } + ] + }, + ] + }, + }, + ) + result = script.extract_posts_liked(data) + assert "instagram_posts_liked" == result.id + assert "Posts Liked" == result.title.translations["en"] + + reference = """ + Liked Link + 0 2023-03-13 22:37 https://example.org/test1 + 1 2023-03-13 22:37 https://example.org/test2 + 2 2023-03-13 23:49 https://example.org/test3 + """ + print(result.data_frame) + assert_frame_str_equal(reference, result.data_frame)