From 66bc1c91b623a27601cefa542789842e98f185eb Mon Sep 17 00:00:00 2001
From: NicolasGensollen
Date: Wed, 31 Aug 2022 15:59:27 +0200
Subject: [PATCH] Propose solution for determining categorical variables automatically

---
 .../pipelines/statistics_surface/_inputs.py   | 125 ++++++++++++++++++
 .../pipelines/statistics_surface/_model.py    |   3 +-
 .../statistics_surface/clinica_surfstat.py    |   4 +-
 .../statistics_surface/test_model.py          |   4 +-
 4 files changed, 132 insertions(+), 4 deletions(-)

diff --git a/clinica/pipelines/statistics_surface/_inputs.py b/clinica/pipelines/statistics_surface/_inputs.py
index 755d07b310..36f0dd3db1 100644
--- a/clinica/pipelines/statistics_surface/_inputs.py
+++ b/clinica/pipelines/statistics_surface/_inputs.py
@@ -38,6 +38,131 @@ def _read_and_check_tsv_file(tsv_file: PathLike) -> pd.DataFrame:
         )
 
 
+def _enumerate_string_columns(df: pd.DataFrame) -> list:
+    """Return the names of the columns whose dtype is the pandas string dtype.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Pandas DataFrame to analyze.
+
+    Returns
+    -------
+    list :
+        List of string column names.
+    """
+    return [c for c in df.columns if df[c].dtype == "string"]
+
+
+def _unique_over_size_ratio(df: pd.DataFrame, column: str) -> float:
+    """Compute the ratio 'number of unique values / length of dataframe'.
+
+    Missing values are not counted among the unique values.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame to analyze.
+
+    column : str
+        Name of the column for which to compute the ratio.
+
+    Returns
+    -------
+    float :
+        The computed ratio for this column, or 0.0 if the DataFrame has no rows.
+    """
+    if len(df) == 0:
+        # Avoid a ZeroDivisionError on a DataFrame without rows.
+        return 0.0
+    return df[column].nunique() / len(df)
+
+
+def _infer_categorical_columns(df: pd.DataFrame, threshold: float = 0.4) -> list:
+    """Heuristic to infer which columns of a dataframe are categorical.
+
+    The function computes, for all string columns, the ratio of the number of unique
+    values over the size of the dataframe. If this ratio is less than the specified
+    threshold, then the column is assumed to be categorical, otherwise it is left as is.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame to analyze.
+
+    threshold : float, optional
+        Threshold to decide if columns are categorical or not. Default=0.4.
+
+    Returns
+    -------
+    list :
+        The list of column names assumed to be categorical.
+
+    Raises
+    ------
+    ValueError
+        If the threshold is not in the [0, 1] interval.
+    """
+    if not 0 <= threshold <= 1:
+        raise ValueError(f"Threshold must be in [0, 1]. {threshold} was provided.")
+    return [
+        c
+        for c in _enumerate_string_columns(df)
+        if _unique_over_size_ratio(df, c) < threshold
+    ]
+
+
+def _categorize(df: pd.DataFrame, threshold: float = 0.4) -> pd.DataFrame:
+    """Cast the columns inferred as categorical to the 'category' dtype.
+
+    The columns to cast are selected with the heuristic implemented in
+    _infer_categorical_columns.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame to clean.
+
+    threshold : float, optional
+        Threshold forwarded to _infer_categorical_columns. Default=0.4.
+
+    Returns
+    -------
+    pd.DataFrame :
+        Cleaned DataFrame.
+    """
+    # Run the heuristic once for the whole DataFrame rather than once per column.
+    categorical_columns = _infer_categorical_columns(df, threshold=threshold)
+    return df.astype({c: "category" for c in categorical_columns})
+
+
+def _read_tsv_file(tsv_file: PathLike, threshold: float = 0.4) -> pd.DataFrame:
+    """Read the provided TSV file and perform a few cleaning steps on the DataFrame.
+
+    That is, the TSV data must have columns 'participant_id' and 'session_id'. These
+    columns will be used to index the DataFrame.
+
+    The columns containing string data will be automatically cast to proper categories
+    if the ratio of the number of unique values over the total size of the DataFrame is
+    smaller than the specified threshold.
+
+    Parameters
+    ----------
+    tsv_file : PathLike
+        Path to the TSV file to read.
+
+    threshold : float, optional
+        Threshold used to decide if string columns are categorical. Default=0.4.
+
+    Returns
+    -------
+    pd.DataFrame :
+        Resulting pandas DataFrame.
+    """
+    df = _read_and_check_tsv_file(tsv_file).convert_dtypes()
+    return _categorize(df, threshold=threshold)
+
+
 def _get_t1_freesurfer_custom_file_template(base_dir: PathLike) -> str:
     """Returns a Template for the path to the desired surface file.
 
diff --git a/clinica/pipelines/statistics_surface/_model.py b/clinica/pipelines/statistics_surface/_model.py
index 78fe92e32b..ce20827a60 100644
--- a/clinica/pipelines/statistics_surface/_model.py
+++ b/clinica/pipelines/statistics_surface/_model.py
@@ -75,7 +75,8 @@ def _categorical_column(df: pd.DataFrame, column: str) -> bool:
     bool :
         `True` if the column contains categorical values, `False` otherwise.
     """
-    return not df[column].dtype.name.startswith("float")
+    column_dtype = df.dtypes[column]
+    return column_dtype == "category"
 
 
 def _build_model(design_matrix: str, df: pd.DataFrame) -> FixedEffect:
diff --git a/clinica/pipelines/statistics_surface/clinica_surfstat.py b/clinica/pipelines/statistics_surface/clinica_surfstat.py
index 02b5d339c3..01fe282663 100644
--- a/clinica/pipelines/statistics_surface/clinica_surfstat.py
+++ b/clinica/pipelines/statistics_surface/clinica_surfstat.py
@@ -6,7 +6,7 @@
     _build_thickness_array,
     _get_average_surface,
     _get_t1_freesurfer_custom_file_template,
-    _read_and_check_tsv_file,
+    _read_tsv_file,
 )
 from ._model import create_glm_model
 
@@ -135,7 +135,7 @@ def clinica_surfstat(
         The threshold to be used to declare clusters as significant. Default=0.05.
     """
     # Load subjects data
-    df_subjects = _read_and_check_tsv_file(tsv_file)
+    df_subjects = _read_tsv_file(tsv_file)
     if surface_file is None:
         surface_file = _get_t1_freesurfer_custom_file_template(input_dir)
     thickness = _build_thickness_array(input_dir, surface_file, df_subjects, fwhm)
diff --git a/test/unittests/pipelines/statistics_surface/test_model.py b/test/unittests/pipelines/statistics_surface/test_model.py
index 57de384e3c..a17882f1bd 100644
--- a/test/unittests/pipelines/statistics_surface/test_model.py
+++ b/test/unittests/pipelines/statistics_surface/test_model.py
@@ -12,7 +12,9 @@
 
 @pytest.fixture
 def df():
-    return pd.read_csv(Path(CURRENT_DIR) / "data/subjects.tsv", sep="\t")
+    from clinica.pipelines.statistics_surface._inputs import _read_tsv_file
+
+    return _read_tsv_file(Path(CURRENT_DIR) / "data/subjects.tsv")
 
 
 def test_missing_column_error(df):