MAINT change `clinical` variable name to metadata

owkin · Jul 31, 2023 · f346162 · f346162
1 parent 51f2eae
commit f346162
Show file tree

Hide file tree

Showing 10 changed files with 154 additions and 170 deletions.
diff --git a/datasets/synthetic/test_clinical.csv → datasets/synthetic/test_metadata.csv b/datasets/synthetic/test_clinical.csv → datasets/synthetic/test_metadata.csv
diff --git a/examples/plot_minimal_pydeseq2_pipeline.py b/examples/plot_minimal_pydeseq2_pipeline.py
@@ -36,7 +36,7 @@
 #
 #   * A count matrix of shape 'number of samples' x 'number of genes', containing
 #     read counts (non-negative integers),
-#   * Clinical data (or "column" data) of shape 'number of samples' x
+#   * Metadata (or "column" data) of shape 'number of samples' x
 #     'number of variables', containing sample annotations that will be used
 #     to split the data in cohorts.
 #
@@ -55,20 +55,20 @@
     debug=False,
 )
 
-clinical_df = load_example_data(
-    modality="clinical",
+metadata = load_example_data(
+    modality="metadata",
     dataset="synthetic",
     debug=False,
 )
 
 print(counts_df)
 
 # %%
-print(clinical_df)
+print(metadata)
 
 
 # %%
-# In this example, the clinical data contains two columns, ``condition`` and ``group``,
+# In this example, the metadata contains two columns, ``condition`` and ``group``,
 # representing two types of bi-level annotations. In the first part, we will only use the
 # ``condition`` factor. Later on, we'll see how to use both the `condition` and the
 # ``group`` factors in our analysis (see :ref:`multifactor_ref`).
@@ -83,12 +83,12 @@
 # this step if you are using real data. To this end you can use the code below.
 #
 # We start by removing samples for which ``condition`` is ``NaN``. If you are using
-# another dataset, do not forget to change "condition" for the column of ``clinical_df``
+# another dataset, do not forget to change "condition" for the column of ``metadata``
 # you wish to use as a design factor in your analysis.
 
-samples_to_keep = ~clinical_df.condition.isna()
+samples_to_keep = ~metadata.condition.isna()
 counts_df = counts_df.loc[samples_to_keep]
-clinical_df = clinical_df.loc[samples_to_keep]
+metadata = metadata.loc[samples_to_keep]
 
 # %%
 # .. note::
@@ -123,32 +123,32 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 #
 # We start by creating a :class:`DeseqDataSet`
-# object from the count and clinical data.
+# object from the count data and metadata.
 # A :class:`DeseqDataSet` fits dispersion and
 # log-fold change (LFC) parameters from the data, and stores them.
 #
 
 dds = DeseqDataSet(
     counts=counts_df,
-    clinical=clinical_df,
+    metadata=metadata,
     design_factors="condition",
     refit_cooks=True,
     n_cpus=8,
 )
 
 # %%
 # A :class:`DeseqDataSet` has two mandatory
-# arguments: a ``counts`` and a ``clinical`` dataframe, like the ones we've loaded in the
+# arguments: a ``counts`` and a ``metadata`` dataframe, like the ones we've loaded in the
 # first part of this tutorial.
 #
-# Next, we should specify the ``design_factor``, i.e. the column of the ``clinical``
+# Next, we should specify the ``design_factors``, i.e. the column of the ``metadata``
 # dataframe that will be used to compare samples. This can be a single string as above,
 # or a list of strings, as in the
 # :ref:`section on multifactor analysis<multifactor_ref>`.
 #
 # .. note::
 #   The ``"condition"`` argument passed to ``design_factors`` corresponds to a column
-#   from the ``clinical_df`` dataframe we loaded earlier.
+#   from the ``metadata`` dataframe we loaded earlier.
 #   You might need to change it according to your own dataset.
 #
 # Several other arguments may be optionally specified (see the :doc:`API documentation
@@ -283,11 +283,11 @@
 #
 # .. currentmodule:: pydeseq2.dds
 #
-# So far, we have only used the ``condition`` column of ``clinical_df``, which divides
-# samples between conditions ``A`` and ``B``. Yet, ``clinical_df`` contains second
+# So far, we have only used the ``condition`` column of ``metadata``, which divides
+# samples between conditions ``A`` and ``B``. Yet, ``metadata`` contains a second
 # column, which separates samples according to ``group`` ``X`` and ``Y``.
 
-print(clinical_df)
+print(metadata)
 
 # %%
 # The goal of multifactor analysis is to use *both* variables to fit LFCs.
@@ -302,7 +302,7 @@
 
 dds = DeseqDataSet(
     counts=counts_df,
-    clinical=clinical_df,
+    metadata=metadata,
     design_factors=["group", "condition"],
     refit_cooks=True,
     n_cpus=8,

diff --git a/examples/plot_pandas_io_example.py b/examples/plot_pandas_io_example.py
@@ -39,7 +39,7 @@
 #
 #   * A count matrix of shape 'number of samples' x 'number of genes', containing
 #     read counts (non-negative integers),
-#   * Clinical data (or annotations, or "column" data) of shape 'number of samples' x
+#   * Metadata (or annotations, or "column" data) of shape 'number of samples' x
 #     'number of variables', containing sample annotations that will be used
 #     to split the data in cohorts.
 #
@@ -53,7 +53,7 @@
 # <https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html#pandas.read_csv>`_.
 #
 # We assume that ``DATA_PATH`` is a directory containing a ``test_counts.csv`` and a
-# ``test_clinical.csv`` file.
+# ``test_metadata.csv`` file.
 
 # Replace this with the path to your dataset
 DATA_PATH = "https://raw.githubusercontent.com/owkin/PyDESeq2/main/datasets/synthetic/"
@@ -71,12 +71,12 @@
 print(counts_df)
 
 # %% Loading annotations
-clinical_df = pd.read_csv(os.path.join(DATA_PATH, "test_clinical.csv"), index_col=0)
-print(clinical_df)
+metadata = pd.read_csv(os.path.join(DATA_PATH, "test_metadata.csv"), index_col=0)
+print(metadata)
 
 
 # %%
-# In this example, the clinical data contains two columns, ``condition`` and ``group``,
+# In this example, the metadata contains two columns, ``condition`` and ``group``,
 # representing two types of bi-level annotations. Here, we will only use the
 # ``condition`` factor.
 
@@ -87,9 +87,9 @@
 # Before proceeding with DEA, we start by preprocessing the data, as in the
 # :doc:`getting started example <plot_minimal_pydeseq2_pipeline>`.
 
-samples_to_keep = ~clinical_df.condition.isna()
+samples_to_keep = ~metadata.condition.isna()
 counts_df = counts_df.loc[samples_to_keep]
-clinical_df = clinical_df.loc[samples_to_keep]
+metadata = metadata.loc[samples_to_keep]
 
 # %%
 # Next, we filter out genes that have less than 10 read counts in total. Note again that
@@ -118,12 +118,12 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 #
 # We start by creating a :class:`DeseqDataSet`
-# object from the count and clinical data that were just loaded.
+# object from the count data and metadata that were just loaded.
 #
 
 dds = DeseqDataSet(
     counts=counts_df,
-    clinical=clinical_df,
+    metadata=metadata,
     design_factors="condition",
     refit_cooks=True,
     n_cpus=8,

diff --git a/examples/plot_step_by_step.py b/examples/plot_step_by_step.py
@@ -43,7 +43,7 @@
 #
 #   * A count matrix of shape 'number of samples' x 'number of genes', containing
 #     read counts (non-negative integers),
-#   * Clinical data (or "column" data) of shape 'number of samples' x
+#   * Metadata (or "column" data) of shape 'number of samples' x
 #     'number of variables', containing sample annotations that will be used
 #     to split the data in cohorts.
 #
@@ -63,8 +63,8 @@
     debug=False,
 )
 
-clinical_df = load_example_data(
-    modality="clinical",
+metadata = load_example_data(
+    modality="metadata",
     dataset="synthetic",
     debug=False,
 )
@@ -76,20 +76,20 @@
 # <dds.DeseqDataSet>` class
 #
 # The :class:`DeseqDataSet <dds.DeseqDataSet>` class has two mandatory
-# arguments, `counts_df` and
-# `clinical_df`, as well as a set of optional keyword arguments, among which:
+# arguments, ``counts`` and
+# ``metadata``, as well as a set of optional keyword arguments, among which:
 #
-# - `design_factor`: the name of the column of clinical to be used as a design
+# - ``design_factors``: the name of the column of metadata to be used as a design
 #   variable
-# - `refit_cooks`: whether to refit cooks outliers – this is advised, in general.
+# - ``refit_cooks``: whether to refit cooks outliers – this is advised, in general.
 #
 # .. note::
 #   in the case of the provided synthetic data, there won't be any Cooks
 #   outliers.
 
 dds = DeseqDataSet(
     counts=counts_df,
-    clinical=clinical_df,
+    metadata=metadata,
     design_factors="condition",  # compare samples based on the "condition"
     # column ("B" vs "A")
     refit_cooks=True,

diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py
@@ -56,19 +56,19 @@ class DeseqDataSet(ad.AnnData):
     ----------
     adata : anndata.AnnData
         AnnData from which to initialize the DeseqDataSet. Must have counts ('X') and
-        clinical metadata ('obs') fields. If ``None``, both ``counts`` and ``clinical``
+        sample metadata ('obs') fields. If ``None``, both ``counts`` and ``metadata``
         arguments must be provided.
 
     counts : pandas.DataFrame
         Raw counts. One column per gene, rows are indexed by sample barcodes.
 
-    clinical : pandas.DataFrame
-        DataFrame containing clinical information.
+    metadata : pandas.DataFrame
+        DataFrame containing sample metadata.
         Must be indexed by sample barcodes.
 
     design_factors : str or list
-        Name of the columns of clinical to be used as design variables.
-        Only bi-level factors are supported. (default: ``'condition'``).
+        Name of the columns of metadata to be used as design variables.
+        Only categorical factors are supported. (default: ``'condition'``).
 
     ref_level : list or None
         An optional list of two strings of the form ``["factor", "test_level"]``
@@ -168,7 +168,7 @@ def __init__(
         *,
         adata: Optional[ad.AnnData] = None,
         counts: Optional[pd.DataFrame] = None,
-        clinical: Optional[pd.DataFrame] = None,
+        metadata: Optional[pd.DataFrame] = None,
         design_factors: Union[str, List[str]] = "condition",
         ref_level: Optional[List[str]] = None,
         min_mu: float = 0.5,
@@ -188,21 +188,21 @@ def __init__(
                 warnings.warn(
                     "adata was provided; ignoring counts.", UserWarning, stacklevel=2
                 )
-            if clinical is not None:
+            if metadata is not None:
                 warnings.warn(
-                    "adata was provided; ignoring clinical.", UserWarning, stacklevel=2
+                    "adata was provided; ignoring metadata.", UserWarning, stacklevel=2
                 )
             # Test counts before going further
             test_valid_counts(adata.X)
             # Copy fields from original AnnData
             self.__dict__.update(adata.__dict__)
-        elif counts is not None and clinical is not None:
+        elif counts is not None and metadata is not None:
             # Test counts before going further
             test_valid_counts(counts)
-            super().__init__(X=counts.astype(int), obs=clinical)
+            super().__init__(X=counts.astype(int), obs=metadata)
         else:
             raise ValueError(
-                "Either adata or both counts and clinical arguments must be provided."
+                "Either adata or both counts and metadata arguments must be provided."
             )
 
         # Convert design_factors to list if a single string was provided.
@@ -216,7 +216,7 @@ def __init__(
         # Build the design matrix
         # Stored in the obsm attribute of the dataset
         self.obsm["design_matrix"] = build_design_matrix(
-            clinical_df=self.obs,
+            metadata=self.obs,
             design_factors=self.design_factors,
             ref_level=ref_level,
             expanded=False,
@@ -934,7 +934,7 @@ def _refit_without_outliers(
                 index=self.counts_to_refit.obs_names,
                 columns=self.counts_to_refit.var_names,
             ),
-            clinical=self.obs,
+            metadata=self.obs,
             design_factors=self.design_factors,
             ref_level=self.ref_level,
             min_mu=self.min_mu,

diff --git a/pydeseq2/ds.py b/pydeseq2/ds.py
@@ -38,7 +38,7 @@ class DeseqStats:
     contrast : list or None
         A list of three strings, in the following format:
         ``['variable_of_interest', 'tested_level', 'ref_level']``.
-        Names must correspond to the clinical data passed to the DeseqDataSet.
+        Names must correspond to the metadata passed to the DeseqDataSet.
         E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B' compared
         to 'condition A'. If None, the last variable from the design matrix is chosen
         as the variable of interest, and the reference level is picked alphabetically.
@@ -611,7 +611,7 @@ def _build_contrast(self, contrast: Optional[List[str]] = None) -> None:
 
         A contrast should be a list of three strings, in the following format:
         ``['variable_of_interest', 'tested_level', 'reference_level']``.
-        Names must correspond to the clinical data passed to the DeseqDataSet.
+        Names must correspond to the metadata passed to the DeseqDataSet.
         E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B'
         compared to 'condition A'. If None, the last variable from the design matrix
         is chosen as the variable of interest, and the reference level is picked