From 66bc1c91b623a27601cefa542789842e98f185eb Mon Sep 17 00:00:00 2001
From: NicolasGensollen <nicolas.gensollen@gmail.com>
Date: Wed, 31 Aug 2022 15:59:27 +0200
Subject: [PATCH] Propose solution for determining categorical variables
 automatically

---
 .../pipelines/statistics_surface/_inputs.py   | 108 ++++++++++++++++++
 .../pipelines/statistics_surface/_model.py    |   3 +-
 .../statistics_surface/clinica_surfstat.py    |   4 +-
 .../statistics_surface/test_model.py          |   4 +-
 4 files changed, 115 insertions(+), 4 deletions(-)

diff --git a/clinica/pipelines/statistics_surface/_inputs.py b/clinica/pipelines/statistics_surface/_inputs.py
index 755d07b310..36f0dd3db1 100644
--- a/clinica/pipelines/statistics_surface/_inputs.py
+++ b/clinica/pipelines/statistics_surface/_inputs.py
@@ -38,6 +38,114 @@ def _read_and_check_tsv_file(tsv_file: PathLike) -> pd.DataFrame:
         )
 
 
+def _enumerate_string_columns(df: pd.DataFrame) -> list:
+    """Lists the names of the columns whose dtype is the pandas "string" dtype.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame whose columns are inspected.
+
+    Returns
+    -------
+    list :
+        Names of the columns with dtype "string", in column order.
+    """
+    return [name for name in df.columns if df[name].dtype == "string"]
+
+
+def _unique_over_size_ratio(df: pd.DataFrame, column: str) -> float:
+    """Computes the ratio 'number of unique values / length of dataframe'.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame to analyze.
+
+    column : str
+        Name of the column for which to compute the ratio.
+
+    Returns
+    -------
+    float :
+        The ratio for this column; missing values count as one unique value.
+    """
+    return df[column].nunique(dropna=False) / len(df)
+
+
+def _infer_categorical_columns(df: pd.DataFrame, threshold: float = 0.4) -> list:
+    """Heuristic guessing which columns of a dataframe hold categorical data.
+
+    For every string column, the number of unique values is divided by the number
+    of rows of the dataframe. A column whose ratio is strictly below the threshold
+    is reported as categorical; the dataframe itself is not modified.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame to analyze.
+
+    threshold : float, optional
+        Ratio cut-off in [0, 1] (ValueError otherwise). Default=0.4.
+
+    Returns
+    -------
+    list :
+        The list of column names assumed to be categorical.
+    """
+    if not (0 <= threshold <= 1):
+        raise ValueError(f"Threshold must be in [0, 1]. {threshold} was provided.")
+    categorical = []
+    for name in _enumerate_string_columns(df):
+        if _unique_over_size_ratio(df, name) < threshold:
+            categorical.append(name)
+    return categorical
+
+
+def _categorize(df: pd.DataFrame) -> pd.DataFrame:
+    """Cast the columns flagged by _infer_categorical_columns to the category dtype.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame to clean.
+
+    Returns
+    -------
+    pd.DataFrame :
+        Copy of `df` in which the inferred categorical columns have dtype "category".
+    """
+    if df.empty:
+        # Nothing to infer from; also avoids dividing by a zero length.
+        return df.copy()
+    # Run the heuristic once for the whole frame rather than once per column.
+    return df.astype({c: "category" for c in _infer_categorical_columns(df)})
+
+
+def _read_tsv_file(tsv_file: PathLike) -> pd.DataFrame:
+    """Reads the provided TSV file and applies a few cleaning steps to the DataFrame.
+
+    The file is loaded through _read_and_check_tsv_file, which is expected to require
+    the 'participant_id' and 'session_id' columns and to use them as index.
+
+    Columns holding string data are then automatically cast to proper categories
+    if the ratio of the number of unique values over the total size of the DataFrame
+    is smaller than the default threshold of _infer_categorical_columns (0.4).
+
+    Parameters
+    ----------
+    tsv_file : PathLike
+        Path to the TSV file to read.
+
+    Returns
+    -------
+    pd.DataFrame :
+        Resulting pandas DataFrame.
+    """
+    raw = _read_and_check_tsv_file(tsv_file)
+    return _categorize(raw.convert_dtypes())
+
+
 def _get_t1_freesurfer_custom_file_template(base_dir: PathLike) -> str:
     """Returns a Template for the path to the desired surface file.
 
diff --git a/clinica/pipelines/statistics_surface/_model.py b/clinica/pipelines/statistics_surface/_model.py
index 78fe92e32b..ce20827a60 100644
--- a/clinica/pipelines/statistics_surface/_model.py
+++ b/clinica/pipelines/statistics_surface/_model.py
@@ -75,7 +75,8 @@ def _categorical_column(df: pd.DataFrame, column: str) -> bool:
     bool :
         `True` if the column contains categorical values, `False` otherwise.
     """
-    return not df[column].dtype.name.startswith("float")
+    column_dtype = df.dtypes[column]
+    return column_dtype == "category"
 
 
 def _build_model(design_matrix: str, df: pd.DataFrame) -> FixedEffect:
diff --git a/clinica/pipelines/statistics_surface/clinica_surfstat.py b/clinica/pipelines/statistics_surface/clinica_surfstat.py
index 02b5d339c3..01fe282663 100644
--- a/clinica/pipelines/statistics_surface/clinica_surfstat.py
+++ b/clinica/pipelines/statistics_surface/clinica_surfstat.py
@@ -6,7 +6,7 @@
     _build_thickness_array,
     _get_average_surface,
     _get_t1_freesurfer_custom_file_template,
-    _read_and_check_tsv_file,
+    _read_tsv_file,
 )
 from ._model import create_glm_model
 
@@ -135,7 +135,7 @@ def clinica_surfstat(
         The threshold to be used to declare clusters as significant. Default=0.05.
     """
     # Load subjects data
-    df_subjects = _read_and_check_tsv_file(tsv_file)
+    df_subjects = _read_tsv_file(tsv_file)
     if surface_file is None:
         surface_file = _get_t1_freesurfer_custom_file_template(input_dir)
     thickness = _build_thickness_array(input_dir, surface_file, df_subjects, fwhm)
diff --git a/test/unittests/pipelines/statistics_surface/test_model.py b/test/unittests/pipelines/statistics_surface/test_model.py
index 57de384e3c..a17882f1bd 100644
--- a/test/unittests/pipelines/statistics_surface/test_model.py
+++ b/test/unittests/pipelines/statistics_surface/test_model.py
@@ -12,7 +12,9 @@
 
 @pytest.fixture
 def df():
-    return pd.read_csv(Path(CURRENT_DIR) / "data/subjects.tsv", sep="\t")
+    from clinica.pipelines.statistics_surface._inputs import _read_tsv_file
+
+    return _read_tsv_file(Path(CURRENT_DIR) / "data/subjects.tsv")
 
 
 def test_missing_column_error(df):