Skip to content

Commit

Permalink
Propose solution for determining categorical variables automatically
Browse files Browse the repository at this point in the history
  • Loading branch information
NicolasGensollen committed Aug 31, 2022
1 parent b225b69 commit 66bc1c9
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 4 deletions.
108 changes: 108 additions & 0 deletions clinica/pipelines/statistics_surface/_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,114 @@ def _read_and_check_tsv_file(tsv_file: PathLike) -> pd.DataFrame:
)


def _enumerate_string_columns(df: pd.DataFrame) -> list:
    """List the columns of a DataFrame whose dtype is the string dtype.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame whose columns are inspected.

    Returns
    -------
    list :
        Names of the columns whose dtype compares equal to ``"string"``,
        in the order in which they appear in the DataFrame.
    """
    string_columns = []
    for name in df.columns:
        if df[name].dtype == "string":
            string_columns.append(name)
    return string_columns


def _unique_over_size_ratio(df: pd.DataFrame, column: str) -> float:
    """Compute the ratio 'number of unique values / length of dataframe'.

    Missing values are not counted as a value of their own, which matches
    how pandas categoricals treat them (missing entries are not a category).

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to analyze.

    column : str
        Name of the column for which to compute the ratio.

    Returns
    -------
    float :
        The computed ratio for this column, or 0.0 if the DataFrame
        has no rows.

    Raises
    ------
    KeyError :
        If `column` is not a column of `df`.
    """
    # Look the column up first so that an unknown name fails even when
    # the DataFrame is empty.
    values = df[column]
    n_rows = len(df)
    if n_rows == 0:
        # The ratio is undefined without rows; avoid a ZeroDivisionError.
        return 0.0
    # Series.nunique copes with pd.NA, whereas sorting-based approaches such
    # as np.unique fail on string columns that contain missing values.
    return values.nunique(dropna=True) / n_rows


def _infer_categorical_columns(df: pd.DataFrame, threshold: float = 0.4) -> list:
    """Heuristically pick the string columns that look categorical.

    For every string column, the number of unique values is divided by the
    number of rows. A column whose ratio is strictly below `threshold` is
    reported as categorical; the other columns are not reported.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to analyze.

    threshold : float, optional
        Ratio below which a column is considered categorical. Default=0.4.

    Returns
    -------
    list :
        The list of column names assumed to be categorical.

    Raises
    ------
    ValueError :
        If `threshold` is not in [0, 1].
    """
    # Chained comparison on purpose: it also rejects NaN thresholds.
    threshold_is_valid = 0 <= threshold <= 1
    if not threshold_is_valid:
        raise ValueError(f"Threshold must be in [0, 1]. {threshold} was provided.")
    categorical = []
    for name in _enumerate_string_columns(df):
        ratio = _unique_over_size_ratio(df, name)
        if ratio < threshold:
            categorical.append(name)
    return categorical


def _categorize(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the columns selected by `_infer_categorical_columns` to categories.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to clean.

    Returns
    -------
    pd.DataFrame :
        New DataFrame in which the columns inferred as categorical have the
        "category" dtype; all other columns keep their dtype.
    """
    # Run the heuristic a single time up front rather than once per column.
    categorical_columns = _infer_categorical_columns(df)
    return df.astype({name: "category" for name in categorical_columns})


def _read_tsv_file(tsv_file: PathLike) -> pd.DataFrame:
    """Read the provided TSV file and apply a few cleaning steps to the DataFrame.

    The file is loaded through `_read_and_check_tsv_file`, its columns are
    converted with `DataFrame.convert_dtypes`, and the string columns that
    the `_categorize` heuristic selects are cast to categories (a column is
    selected when its ratio of unique values over the number of rows is
    smaller than the default threshold of 0.4).

    Parameters
    ----------
    tsv_file : PathLike
        Path to the TSV file to read.

    Returns
    -------
    pd.DataFrame :
        Resulting pandas DataFrame.
    """
    raw = _read_and_check_tsv_file(tsv_file)
    typed = raw.convert_dtypes()
    return _categorize(typed)


def _get_t1_freesurfer_custom_file_template(base_dir: PathLike) -> str:
"""Returns a Template for the path to the desired surface file.
Expand Down
3 changes: 2 additions & 1 deletion clinica/pipelines/statistics_surface/_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ def _categorical_column(df: pd.DataFrame, column: str) -> bool:
bool :
`True` if the column contains categorical values, `False` otherwise.
"""
return not df[column].dtype.name.startswith("float")
column_dtype = df.dtypes[column]
return column_dtype == "category"


def _build_model(design_matrix: str, df: pd.DataFrame) -> FixedEffect:
Expand Down
4 changes: 2 additions & 2 deletions clinica/pipelines/statistics_surface/clinica_surfstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
_build_thickness_array,
_get_average_surface,
_get_t1_freesurfer_custom_file_template,
_read_and_check_tsv_file,
_read_tsv_file,
)
from ._model import create_glm_model

Expand Down Expand Up @@ -135,7 +135,7 @@ def clinica_surfstat(
The threshold to be used to declare clusters as significant. Default=0.05.
"""
# Load subjects data
df_subjects = _read_and_check_tsv_file(tsv_file)
df_subjects = _read_tsv_file(tsv_file)
if surface_file is None:
surface_file = _get_t1_freesurfer_custom_file_template(input_dir)
thickness = _build_thickness_array(input_dir, surface_file, df_subjects, fwhm)
Expand Down
4 changes: 3 additions & 1 deletion test/unittests/pipelines/statistics_surface/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@

@pytest.fixture
def df():
    """Load the test subjects TSV through the pipeline's own `_read_tsv_file`."""
    # Imported locally so the reader is only needed when the fixture is used.
    from clinica.pipelines.statistics_surface._inputs import _read_tsv_file

    return _read_tsv_file(Path(CURRENT_DIR) / "data/subjects.tsv")


def test_missing_column_error(df):
Expand Down

0 comments on commit 66bc1c9

Please sign in to comment.