Merge branch 'fetch-keepControl-data' into development-main

neurogeriatricskiel · Jul 18, 2024 · a980386 · a980386
2 parents 7bda92f + 0586e6b
commit a980386
Show file tree

Hide file tree

Showing 15 changed files with 1,747 additions and 721 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,7 @@
 __pycache__/
 projects/
 /ngmt.egg-info
+ngmt/datasets/_*
 
 my_messy_code/mytestconde.py
 

diff --git a/docs/datasets/mobilised.md b/docs/datasets/mobilised.md
@@ -1,7 +1,31 @@
 ## Mobilise-D
 
-The Mobilise-D dataset derived from the [Mobilise-D](https://mobilise-d.eu/) consortium and is a European project that aims to develop a comprehensive system to monitor and evaluate people's gait based on digital technologies, including sensors worn on the body, such as a low back-worn inertial measurement unit (IMU). Example data were made publicly available as Micó-Amigo *et al*., Zenodo, 2023, Assessing real-world gait with digital technology? Validation, insights and recommendations from the Mobilise-D consortium [[Data set]](https://zenodo.org/records/7547125), doi: [10.5281/zenodo.7547125](https://doi.org/10.5281/zenodo.7547125) and results for the entire dataset were published as Micó-Amigo *et al*., Journal of NeuroEngineering and Rehabilitation, 2023, Assessing real-world gait with digital technology? Validation, insights and recommendations from the Mobilise-D consortium, doi: [10.1186/s12984-023-01198-5](https://doi.org/10.1186/s12984-023-01198-5).
+The Mobilise-D dataset is derived from the [Mobilise-D](https://mobilise-d.eu/) consortium, which was a European project aimed at developing digital mobility outcomes to monitor the daily life gait of people with various mobility problems. For monitoring daily life gait a low back-worn inertial measurement unit (IMU) was used. 
+
+Example data were made publicly available as Micó-Amigo *et al*., Zenodo, 2023, Assessing real-world gait with digital technology? Validation, insights and recommendations from the Mobilise-D consortium [[Data set]](https://zenodo.org/records/7547125), doi: [10.5281/zenodo.7547125](https://doi.org/10.5281/zenodo.7547125) and results for the entire dataset were published as Micó-Amigo *et al*., Journal of NeuroEngineering and Rehabilitation, 2023, Assessing real-world gait with digital technology? Validation, insights and recommendations from the Mobilise-D consortium, doi: [10.1186/s12984-023-01198-5](https://doi.org/10.1186/s12984-023-01198-5).
+
+The example data can be fetched from the Zenodo repository, and data are saved in the following structure:
+```
+── ngmt/
+│   ├── datasets/
+│   │   ├── _mobilised/
+│   │   │   ├── CHF/
+│   │   │   │   ├── data.mat
+│   │   │   │   └── infoForAlgo.mat
+│   │   │   ├── ...
+│   │   │   ├── PFF/
+│   │   │   │   ├── data.mat
+│   │   │   │   └── infoForAlgo.mat
+│   ├── modules/
+│   ├── ...
+│   ├── __init__.py
+│   └── config.py
+│   ...
+```
+
+With for each of the cohorts (congestive heart failure (CHF), chronic obstructive pulmonary disease (COPD), healthy adult (HA), multiple sclerosis (MS), Parkinson’s disease (PD) and proximal femoral fracture (PFF)) an example data file (`data.mat`) and a file with information of the participant (`infoForAlgo.mat`). 
+
+The recording can be loaded into a `NGMTRecording` with the corresponding function:
 
-For this dataset a simple load function is provided to load the data into the NGMT dataclasses.
 
 ::: datasets.mobilised
diff --git a/examples/basic_00_intro_ngmt.ipynb b/examples/basic_00_intro_ngmt.ipynb
diff --git a/examples/basic_01_load_Data.ipynb b/examples/basic_01_load_Data.ipynb
diff --git a/examples/modules_01_gsd.ipynb b/examples/modules_01_gsd.ipynb
diff --git a/examples/modules_02_icd.ipynb b/examples/modules_02_icd.ipynb
diff --git a/examples/modules_02_icd_keepControl.ipynb b/examples/modules_02_icd_keepControl.ipynb
diff --git a/ngmt/datasets/__pycache__/keepcontrol.cpython-311.pyc b/ngmt/datasets/__pycache__/keepcontrol.cpython-311.pyc
diff --git a/ngmt/datasets/__pycache__/mobilised.cpython-311.pyc b/ngmt/datasets/__pycache__/mobilised.cpython-311.pyc
diff --git a/ngmt/datasets/keepcontrol.py b/ngmt/datasets/keepcontrol.py
@@ -1,88 +1,149 @@
 import numpy as np
 import pandas as pd
-import pathlib
-import os
+from pathlib import Path
+import openneuro
+from typing import Union, Optional
 from ngmt.utils.ngmt_dataclass import NGMTRecording
 from ngmt.utils.ngmt_dataclass import REQUIRED_COLUMNS
 
+# Dict of valid tracked points for the Keep Control dataset for each tracking system
+VALID_TRACKED_POINTS = {
+    "omc": [
+        'l_toe', 'l_heel', 'l_ank', 'l_sk1', 'l_sk2', 'l_sk3', 'l_sk4', 'l_th1', 'l_th2',
+        'l_th3', 'l_th4', 'r_toe', 'r_heel', 'r_ank', 'r_sk1', 'r_sk2', 'r_sk3', 'r_sk4',
+        'r_th1', 'r_th2', 'r_th3', 'r_th4', 'l_asis', 'r_asis', 'l_psis', 'r_psis',
+        'm_ster1', 'm_ster2', 'm_ster3', 'l_sho', 'l_ua', 'l_elbl', 'l_frm', 'l_wrr',
+        'l_wru', 'l_hand', 'r_sho', 'r_ua', 'r_elbl', 'r_frm', 'r_wrr', 'r_wru', 'r_hand',
+        'lf_hd', 'rf_hd', 'lb_hd', 'rb_hd', 'start_1', 'start_2', 'end_1', 'end_2'
+    ],
+    "imu": [
+        'head', 'sternum', 'left_upper_arm', 'left_fore_arm', 'right_upper_arm',
+        'right_fore_arm', 'pelvis', 'left_thigh', 'left_shank', 'left_foot',
+        'right_thigh', 'right_shank', 'right_foot', 'left_ankle', 'right_ankle'
+    ]
+}
 
+def fetch_dataset(
+    dataset_path: str | Path = Path(__file__).parent / "_keepcontrol",
+) -> None:
+    """Fetch the Keep Control dataset from the OpenNeuro repository.
+
+    Args:
+        progressbar (bool, optional): Whether to display a progressbar. Defaults to True.
+        dataset_path (str | Path, optional): The path where the dataset is stored. Defaults to Path(__file__).parent/"_keepcontrol".
+    """
+    dataset_path = Path(dataset_path) if isinstance(dataset_path, str) else dataset_path
+
+    # Check if target folder exists, if not create it
+    if not dataset_path.exists():
+        dataset_path.parent.joinpath("_keepcontrol").mkdir(parents=True, exist_ok=True)
+
+    # check if the dataset has already been downloaded (directory is not empty), if not download it
+    if not any(dataset_path.iterdir()):
+        # Download the dataset using openneuro-py
+        openneuro.download(
+            dataset="ds005258", # this is the example Keep Control dataset on OpenNeuro, maintained by Julius Welzel
+            target_dir=dataset_path,
+        )
+
+        print(f"Dataset has been downloaded to {dataset_path}")
+
+    # print message to user if dataset has already been downloaded
+    else:
+        print(f"Dataset has already been downloaded to {dataset_path}")
+
+    return
+
 def load_recording(
-    file_name: pathlib.Path,
-    tracking_systems: str | list[str],
-    tracked_points: str | list[str] | dict[str, str] | dict[str, list[str]],
+    dataset_path: str | Path = Path(__file__).parent / "_keepcontrol",
+    id: str = "pp001",
+    task: str = "walkSlow",
+    tracking_systems: Union[str, list[str]] = ["imu", "omc"],
+    tracked_points: Optional[Union[None, str, list[str]]] = None,
 ):
     """
     Load a recording from the Keep Control validation study.
 
     Args:
-        file_name (pathlib.Path): The absolute or relative path to the data file.
-        tracking_systems (str or list of str) : A string or list of strings of tracking systems for which data are to be returned.
-        tracked_points (str or list of str or dict[str, str] or dict[str, list of str]) :
-            Defines for which tracked points data are to be returned.
-            If a string or list of strings is provided, then these will be mapped to each requested tracking system.
-            If a dictionary is provided, it should map each tracking system to either a single tracked point or a list of tracked points.
+        dataset_path (str or Path, optional): The path to the dataset. Defaults to the "_keepcontrol" directory in the same directory as this file.
+        id (str): The ID of the recording.
+        tracking_systems (str or list of str): A string or list of strings representing the tracking systems for which data should be returned.
+        tracked_points (None, str or list of str, optional): The tracked points of interest. If None, all tracked points will be returned. Defaults to None.
 
     Returns:
-        NGMTRecording : An instance of the NGMTRecording dataclass containing the loaded data and channels.
+        NGMTRecording: An instance of the NGMTRecording dataclass containing the loaded data and channels.
     """
+
+    # Fetch the dataset if it does not exist
+    if not dataset_path.exists():
+        fetch_dataset()
+
+    # check if id contains sub or sub- substring, if so replace it with ''
+    id = id.replace("sub", "").replace("-", "")
+
+    # check if task contains task or task- substring, if so replace it with ''
+    task = task.replace("task", "").replace("-", "")
+
+
     # Put tracking systems in a list
     if isinstance(tracking_systems, str):
         tracking_systems = [tracking_systems]
 
+    # check if tracked points has been specified, if not use all tracked points
+    if tracked_points is None:
+        tracked_points = {tracksys: VALID_TRACKED_POINTS[tracksys] for tracksys in tracking_systems}
     # Tracked points will be a dictionary mapping
     # each tracking system to a list of tracked points of interest
     if isinstance(tracked_points, str):
         tracked_points = [tracked_points]
     if isinstance(tracked_points, list):
         tracked_points = {tracksys: tracked_points for tracksys in tracking_systems}
-    for k, v in tracked_points.items():
-        if isinstance(v, str):
-            tracked_points[k] = [v]
+    # use the VALID_TRACKED_POINTS dictionary to get the valid tracked points for each tracking system
+    # return error of some tracked_points are not valid
+    # print which of the specified tracked points are not valid
+    for tracksys in tracking_systems:
+        if not all(tracked_point in VALID_TRACKED_POINTS[tracksys] for tracked_point in tracked_points[tracksys]):
+            print(f"Invalid tracked points for tracking system {tracksys}.")
+            print(f"Valid tracked points are: {VALID_TRACKED_POINTS[tracksys]}")
+            invalid_points = [tracked_point for tracked_point in tracked_points[tracksys] if tracked_point not in VALID_TRACKED_POINTS[tracksys]]
+            print(f"Invalid tracked points are: {invalid_points}")
+            return
 
-    # From the file_name, extract the tracking system
-    search_str = "_tracksys-"
-    idx_from = str(file_name).find(search_str) + len(search_str)
-    idx_to = idx_from + str(file_name)[idx_from:].find("_")
-    current_tracksys = str(file_name)[idx_from:idx_to]
 
-    # Initialize the data and channels dictionaroes
+    # Load data and channels for each tracking system
     data_dict, channels_dict = {}, {}
     for tracksys in tracking_systems:
-        # Set current filename
-        current_file_name = str(file_name).replace(
-            f"{search_str}{current_tracksys}", f"{search_str}{tracksys}"
-        )
-        if os.path.isfile(current_file_name):
-            # Read the data and channels info into a pandas DataFrame
-            df_data = pd.read_csv(current_file_name, header=0, sep="\t")
-            df_channels = pd.read_csv(
-                current_file_name.replace("_motion.tsv", "_channels.tsv"),
-                header=0,
-                sep="\t",
-            )
-
-            # Now select only for the tracked points of interest
-            df_data = df_data.loc[
-                :,
-                [
-                    col
-                    for col in df_data.columns
-                    if any(
-                        [
-                            tracked_point in col
-                            for tracked_point in tracked_points[tracksys]
-                        ]
-                    )
-                ],
-            ]
-            df_channels = df_channels[
-                (df_channels["tracked_point"].isin(tracked_points[tracksys]))
-            ]
-
-            # Put data and channels in output dictionaries
-            col_names = [c for c in REQUIRED_COLUMNS] + [
-                c for c in df_channels.columns if c not in REQUIRED_COLUMNS
-            ]
-            data_dict[tracksys] = df_data
-            channels_dict[tracksys] = df_channels[col_names]
-    return NGMTRecording(data=data_dict, channels=channels_dict)
+        # Find avaliable files for the give ID and task and tracking system
+        file_name = list(dataset_path.glob(f"sub-{id}/motion/sub-{id}_task-{task}_tracksys-{tracksys}_*motion.tsv"))
+        # check if file exists, if not print message to user and return
+        if not file_name:
+            print(f"No files found for ID {id}, task {task}, and tracking system {tracksys}.")
+            return
+        # check if multiple files are found, if so print message to user and return
+        if len(file_name) > 1:
+            print(f"Multiple files found for ID {id}, task {task}, and tracking system {tracksys}.")
+            return
+
+        # Load the data and channels for the tracking system
+        df_data = pd.read_csv(file_name[0], sep="\t")
+        df_channels = pd.read_csv(file_name[0].parent / f"sub-{id}_task-{task}_tracksys-{tracksys}_channels.tsv", sep="\t")
+
+        # filter the data and channels to only include the tracked points of interest
+        df_channels = df_channels[df_channels["tracked_point"].isin(tracked_points[tracksys])]
+
+        # only keep df_data columns that are in df_channels
+        col_names = df_channels["name"].values
+        df_data = df_data[col_names]        
+
+        # transform the data and channels into a dictionary for the NGMTRecording dataclass
+        data_dict[tracksys] = df_data
+        channels_dict[tracksys] = df_channels
+
+    # construct data class
+    recording = NGMTRecording(data=data_dict, channels=channels_dict)
+
+    # add information about the recording to the data class
+    recording.add_info("Subject", id)
+    recording.add_info("Task", task)
+
+    return recording