doc string updates

labsyspharm · Mar 1, 2024 · 5e8437e · 5e8437e
1 parent 44f84b6
commit 5e8437e
Show file tree

Hide file tree

Showing 3 changed files with 76 additions and 71 deletions.
diff --git a/scimap/preprocessing/_combat.py b/scimap/preprocessing/_combat.py
@@ -53,7 +53,6 @@ def combat(
         The corrected expression data is stored in a new layer `adata.layers['combat']`.
 
 Examples:
-
     ```python
 
     # applying batch correction using raw data

diff --git a/scimap/preprocessing/_rescale.py b/scimap/preprocessing/_rescale.py
@@ -28,32 +28,33 @@ def rescale (adata,
              imageid='imageid', 
              failed_markers=None,
              method='all',
+             verbose=True,
              random_state=0):
     """
 Parameters:
-    adata (AnnData Object):  
+    adata (AnnData Object, required):  
         An annotated data object that contains single-cell expression data.
 
     gate (DataFrame, optional):   
-        A pandas DataFrame where the first column lists markers, and subsequent columns contain gate values for each image in the dataset. Column names must correspond to unique `imageid` identifiers. If a single column of gate values is provided for a dataset with multiple images, the same gate will be uniformly applied to all. If no gates are provided for specific markers, the function attempts to automatically determine gates using a Gaussian Mixture Model (GMM). Defaults to None.
+        A pandas DataFrame where the first column lists markers, and subsequent columns contain gate values for each image in the dataset. Column names must correspond to unique `imageid` identifiers. If a single column of gate values is provided for a dataset with multiple images, the same gate will be uniformly applied to all. If no gates are provided for specific markers, the function attempts to automatically determine gates using a Gaussian Mixture Model (GMM). 
         
     log (bool, optional):  
-        If True, the data in `adata.raw.X` will be log-transformed (using log1p) before gate application. This transformation is recommended when automatic gate identification through GMM is performed, as it helps in normalizing data distributions. Defaults to True.
+        If `True`, the data in `adata.raw.X` will be log-transformed (using log1p) before gate application. This transformation is recommended when automatic gate identification through GMM is performed, as it helps in normalizing data distributions. 
         
     imageid (str, optional):  
-        The name of the column in `adata` that contains Image IDs. This is necessary for matching manual gates specified in the `gate` DataFrame to their respective images. Defaults to 'imageid'.
+        The name of the column in `adata` that contains Image IDs. This is necessary for matching manual gates specified in the `gate` DataFrame to their respective images. 
         
     failed_markers (dict, optional):  
-        A dictionary mapping `imageid` to markers that failed quality control. This allows for the exclusion of specific markers from the analysis based on prior visual inspection or other criteria. The dictionary can use 'all' as a key to specify markers that failed across all images. Defaults to None.
+        A dictionary mapping `imageid` to markers that failed quality control. This allows for the exclusion of specific markers from the analysis based on prior visual inspection or other criteria. The dictionary can use `all` as a key to specify markers that failed across all images. 
         
     method (str, optional):  
-        Specifies the gating strategy: 'all' to pool data from all images for GMM application, or 'by_image' to apply GMM separately for each image. 'all' may introduce batch effects, while 'by_image' requires sufficient variation within each image to distinguish negative from positive populations effectively. Defaults to 'by_image'.
+        Specifies the gating strategy: `all` to pool data from all images for GMM application, or `by_image` to apply GMM separately for each image. `all` may introduce batch effects, while `by_image` requires sufficient variation within each image to distinguish negative from positive populations effectively. 
         
     random_state (int, optional):  
-        The seed used by the random number generator for GMM. Ensures reproducibility of results. Defaults to 0.
+        The seed used by the random number generator for GMM. Ensures reproducibility of results.
 
     verbose (bool, optional):  
-        If True, detailed progress updates and diagnostic messages will be printed during the function's execution. This is useful for tracking the processing stages and debugging. Defaults to False.
+        If `True`, detailed progress updates and diagnostic messages will be printed during the function's execution.
 
 Returns:
     Modified AnnData Object (AnnData):  
@@ -64,14 +65,14 @@ def rescale (adata,
     
     # Example with manual gates
     manual_gate = pd.DataFrame({'marker': ['CD3D', 'KI67'], 'gate': [7, 8]}) 
-    adata = sm.pp.rescale(adata, gate=manual_gate, failed_markers={'all': ['CD20', 'CD21']}, verbose=True)
+    adata = sm.pp.rescale(adata, gate=manual_gate, failed_markers={'all': ['CD20', 'CD21']})
         
     # Importing gates from a CSV
     manual_gate = pd.read_csv('manual_gates.csv')
-    adata = sm.pp.rescale(adata, gate=manual_gate, failed_markers={'all': ['CD20', 'CD21']}, verbose=True)
+    adata = sm.pp.rescale(adata, gate=manual_gate, failed_markers={'all': ['CD20', 'CD21']})
         
     # Running without manual gates to use GMM for automatic gate determination
-    adata = sm.pp.rescale(adata, gate=None, failed_markers={'all': ['CD20', 'CD21']}, verbose=True)
+    adata = sm.pp.rescale(adata, gate=None, failed_markers={'all': ['CD20', 'CD21']})
     
     ```
 
@@ -84,8 +85,8 @@ def rescale (adata,
         adata.raw = adata
 
     # Mapping between markers and gates in the given dataset
-    dataset_markers = list(adata.var.index)
-    dataset_images = list(adata.obs[imageid].unique())
+    dataset_markers = adata.var.index.tolist()
+    dataset_images = adata.obs[imageid].unique().tolist()    
     m= pd.DataFrame(index=dataset_markers, columns=dataset_images).reset_index()
     m= pd.melt(m, id_vars=[m.columns[0]])
     m.columns = ['markers', 'imageid', 'gate']
@@ -106,7 +107,8 @@ def rescale (adata,
 
     # Addressing failed markers
     def process_failed (adata_subset, foramted_failed_markers):
-        print('Processing Failed Marker in ' + str(adata_subset.obs[imageid].unique()[0]))
+        if verbose:
+            print('Processing Failed Marker in ' + str(adata_subset.obs[imageid].unique()[0]))
         # prepare data
         data_subset = pd.DataFrame(adata_subset.raw.X, columns=adata_subset.var.index, index=adata_subset.obs.index)
         if log is True:
@@ -178,15 +180,17 @@ def clipping (x):
 
     # Find GMM based gates
     def gmm_gating (marker, data):
-        print('Finding the optimal gate by GMM for ' + str(marker))
+        if verbose:
+            print('Finding the optimal gate by GMM for ' + str(marker))
         data_gm = data[marker].values.reshape(-1, 1)
         gmm = GaussianMixture(n_components=2, random_state=random_state).fit(data_gm)
         gate = np.mean(gmm.means_)
         return gate
 
     # Running gmm_gating on the dataset
     def gmm_gating_internal (adata_subset, gate_mapping, method):
-        print('GMM for ' + str(adata_subset.obs[imageid].unique()))
+        if verbose:
+            print('GMM for ' + str(adata_subset.obs[imageid].unique()))
         data_subset = pd.DataFrame(adata_subset.raw.X, columns=adata_subset.var.index, index=adata_subset.obs.index)      
         # find markers
         if method == 'all':
@@ -240,7 +244,8 @@ def gmm_gating_internal (adata_subset, gate_mapping, method):
 
     # Rescaling function
     def data_scaler (adata_subset, gate_mapping):
-        print('Scaling Image ' + str(adata_subset.obs[imageid].unique()[0]))
+        if verbose:
+            print('Scaling Image ' + str(adata_subset.obs[imageid].unique()[0]))
         # Organise data
         data_subset = pd.DataFrame(adata_subset.raw.X, columns=adata_subset.var.index, index=adata_subset.obs.index)
         if log is True:
@@ -250,7 +255,8 @@ def data_scaler (adata_subset, gate_mapping):
 
         # organise gates
         def data_scaler_internal (marker, gate_mapping_sub):
-            print('Scaling ' + str(marker))
+            if verbose:
+                print('Scaling ' + str(marker))
             # find the gate
             moi = gate_mapping_sub[gate_mapping_sub.markers == marker]['gate'].values[0]
 
@@ -338,6 +344,7 @@ def data_scaler_internal (marker, gate_mapping_sub):
     parser.add_argument('--imageid', type=str, default='imageid', help='The column containing the Image IDs')
     parser.add_argument('--failedmarkers', type=str, default=None, help='Markers that were deemed to have failed based on prior visual inspection')
     parser.add_argument('--method', type=str, default='all', help='Two avialble option are- all or by_image')
+    parser.add_argument('--verbose', type=bool, default=None, help='The function will print detailed messages about its progress')
     parser.add_argument('--randomstate', type=str, default=0, help='Seed for GMM. The default is 0')
     args = parser.parse_args()
 

diff --git a/scimap/tools/_phenotype_cells.py b/scimap/tools/_phenotype_cells.py
@@ -4,29 +4,21 @@
 # @author: Ajit Johnson Nirmal
 """
 !!! abstract "Short Description"
-    `sm.tl.phenotype_cells`: The phenotyping function takes in the `scaled data` and a prior knowledge based `phenotype workflow` 
-    file to assign phenotype annotation to each cell in the dataset. Use the `sm.tl.rescale` function to scale the data first. 
-        
-    *Phenotype workflow file description:*  
-    An example of the `phenotype_workflow.csv` can be found [here](https://github.com/ajitjohnson/scimap/blob/master/scimap/tests/_data/phenotype_workflow.csv).  
-        
-    The `phenotype_workflow` accepts six categories of gating strategy for performing phenotyping.
-        
-    - allpos
-    - allneg
-    - anypos
-    - anyneg
-    - pos
-    - neg
-        
-    `allpos`- All of the defined markers should be positive.  
-    `allneg`- All of the defined markers should be negative.  
-    `anypos`- Any one of the defined marker is sufficient to be positive. (e.g) For defining macrophages, one could use a strategy in which a cell is defined as a macrophage if any of `CD68, CD163 or CD206` is positive.  
-    `anyneg`- Any of the defined marker is negative.  
-    `pos`- A given marker is positive. If this argument is passed to multiple markers. (e.g) If regulatory T cell is defined as `CD4+`, `FOXP3+` by passing `pos` to each the markers and the algorithm finds that for a few cells one of the two is not, the algorithm will assign the cell as likely-regulatory T cell and will allow the user to make the decision later.  
-    `neg`- A given marker is negative.  
-
-    *It is always advised to use positive markers over negative markers*  
+    `sm.tl.phenotype_cells`: This function annotates each cell in the dataset with a phenotype based on `scaled data` and a predefined `phenotype workflow`. Before using this function, ensure the data is scaled with the `sm.pp.rescale` function.
+    
+    *Description of the Phenotype Workflow File:*  
+    Find an example `phenotype_workflow.csv` [here](https://github.com/ajitjohnson/scimap/blob/master/scimap/tests/_data/phenotype_workflow.csv).
+    
+    The `phenotype_workflow` file outlines six gating strategies for cell phenotyping:
+    
+    - **allpos**: Requires all specified markers to be positive for a cell to be assigned the phenotype.
+    - **allneg**: Requires all specified markers to be negative for a cell to be assigned the phenotype.
+    - **anypos**: Requires at least one of the specified markers to be positive for a cell to be assigned the phenotype. For example, a cell could be identified as a macrophage if it is positive for any of the markers `CD68`, `CD163`, or `CD206`.
+    - **anyneg**: Requires at least one of the specified markers to be negative for a cell to be assigned the phenotype.
+    - **pos**: Specifies that a cell must be positive for the given marker(s) to be assigned the phenotype. If used for multiple markers, cells not meeting all criteria may still be classified as a potential phenotype, allowing for later refinement by the user. For instance, regulatory T cells could be defined as `CD4+` and `FOXP3+`; cells not fully meeting these criteria might be labeled as likely-regulatory T cells for further evaluation.
+    - **neg**: Specifies that a cell must be negative for the given marker(s) to be assigned the phenotype.
+    
+    *Recommendation*: Prioritize using positive markers to define phenotypes whenever possible.
 
 ## Function
 """
@@ -43,46 +35,49 @@ def phenotype_cells (adata,
                      label="phenotype", 
                      imageid='imageid',
                      pheno_threshold_percent=None, 
-                     pheno_threshold_abs=None):
+                     pheno_threshold_abs=None,
+                     verbose=True
+                     ):
     """
     
 Parameters:
+    adata (AnnData):  
+        The input AnnData object containing single-cell data for phenotyping.
 
-    adata : anndata object
-    
-    phenotype (dataframe):   
-        A gating strategy for phenotyping the cells. An example `workflow` provided [here](https://github.com/ajitjohnson/scimap/blob/master/scimap/tests/_data/phenotype_workflow.csv).
+    phenotype (pd.DataFrame):  
+        A DataFrame specifying the gating strategy for cell phenotyping. It should outline the workflow for phenotype classification based on marker expression levels. An example workflow is available at [this GitHub link](https://github.com/ajitjohnson/scimap/blob/master/scimap/tests/_data/phenotype_workflow.csv).
         
-    gate (int):  
-        By default rescale function, scales the data such that values above 0.5 are considered positive cells.
+    gate (float, optional):  
+        The threshold value for determining positive cell classification based on scaled data. By convention, values above this threshold are considered to indicate positive cells. 
         
-    label (string):  
-        Name the column underwhich the final phenotype calling will be saved.
+    label (str, optional):  
+        The name of the column in `adata.obs` where the final phenotype classifications will be stored. This label will be used to access the phenotyping results within the `AnnData` object.
         
-    imageid (string):  
-        Name of the column that contains the unique imageid. This is only utilized
-        when the user uses `pheno_threshold_percent` or `pheno_threshold_abs` parameters.
+    imageid (str, optional):  
+        The name of the column in `adata.obs` that contains unique image identifiers. This is crucial for analyses that require differentiation of data based on the source image, especially when using phenotype threshold parameters (`pheno_threshold_percent` or `pheno_threshold_abs`).
         
-    pheno_threshold_percent (float):  
-        Accepts values between (0-100). If any particular phenotype is below the user defined threshold,
-        it is recategorised as 'unknown. Generally used to deal with low background false positives.
+    pheno_threshold_percent (float, optional):  
+        A threshold value (between 0 and 100) specifying the minimum percentage of cells that must exhibit a particular phenotype for it to be considered valid. Phenotypes not meeting this threshold are reclassified as 'unknown'. This parameter is useful for minimizing the impact of low-frequency false positives. 
         
-    pheno_threshold_abs (int):  
-        Serves the same purpose as that of pheno_threshold_percent. However, an absolute
-        number can be passed. For example, if user passes in 10- any phenotype that contains
-        less than 10 cells will be recategorized as unknown.
+    pheno_threshold_abs (int, optional):  
+        Similar to `pheno_threshold_percent`, but uses an absolute cell count instead of a percentage. Phenotypes with cell counts below this threshold are reclassified as 'unknown'. This can help in addressing rare phenotype classifications that may not be meaningful. 
+    
+    verbose (bool, optional):  
+        If set to `True`, the function will print detailed messages about its progress and the steps being executed.
 
 Returns:
-    adata
-        Updated AnnData object with the phenotype calls for each cell. Check `adata.obs['phenotype']` for results.
+    AnnData:  
+        The input AnnData object, updated to include the phenotype classifications for each cell. The phenotyping results can be found in `adata.obs[label]`, where `label` is the name specified by the user for the phenotype column.
 
 Example:    
     ```python
-    # load the workflow csv file
+    
+    # Load the phenotype workflow CSV file
     phenotype = pd.read_csv('path/to/csv/file/')  
-    # phenotype the cells based on the workflow provided
-    adata = sm.tl.phenotype_cells (adata, phenotype=phenotype, 
-    gate = 0.5, label="phenotype")
+    
+    # Apply phenotyping to cells based on the specified workflow
+    adata = sm.tl.phenotype_cells(adata, phenotype=phenotype, gate=0.5, label="phenotype")
+    
     ```
 
     """
@@ -133,8 +128,9 @@ def gate_satisfation_morethan (marker, data, gate):
         r_gate_satisfation_morethan = lambda x: gate_satisfation_morethan(marker=x, data=data, gate=gate)
 
         def prob_mapper (data, all_phenotype, cell, gate):
-
-            print("Phenotyping " + str(cell))
+
+            if verbose:
+                print("Phenotyping " + str(cell))
 
             # Get the appropriate dict from all_phenotype
             p = all_phenotype[cell]
@@ -280,15 +276,17 @@ def prob_mapper (data, all_phenotype, cell, gate):
                 #cells_of_interest = phenotype_labels[phenotype_labels[column_of_interest] == i].index
                 cells_of_interest = phenotype_labels[phenotype_labels[column_of_interest].eq(i).any(axis=1)].index
                 d = data.loc[cells_of_interest]
-                print("-- Subsetting " + str(i))
+                if verbose:
+                    print("-- Subsetting " + str(i))
                 phenotype_l = pd.DataFrame(phenotype_cells(data = d, group = i, phenotype=phenotype, gate=gate), columns = [i])
                 phenotype_labels = phenotype_labels.merge(phenotype_l, how='outer', left_index=True, right_index=True)
 
     # Rearrange the rows back to original
     phenotype_labels = phenotype_labels.reindex(data.index)
     phenotype_labels = phenotype_labels.replace('-rest', np.nan, regex=True)
 
-    print("Consolidating the phenotypes across all groups")
+    if verbose:
+        print("Consolidating the phenotypes across all groups")
     phenotype_labels_Consolidated = phenotype_labels.fillna(method='ffill', axis = 1)
     phenotype_labels[label] = phenotype_labels_Consolidated.iloc[:,-1].values
 
@@ -342,6 +340,7 @@ def remove_phenotype(p, ID, pheno_threshold_percent, pheno_threshold_abs):
     parser.add_argument('--imageid', type=str, default='imageid', help='Name of the column that contains the unique imageid')
     parser.add_argument('--pheno_threshold_percent', type=float, default=True, help='Accepts values between (0-100). If any particular phenotype is below the user defined threshold, it is recategorised as unknown')
     parser.add_argument('--pheno_threshold_abs', type=int, default=None, help='Serves the same purpose as that of pheno_threshold_percent. However, an absolute number can be passed')
+    parser.add_argument('--verbose', required=False, default=True, help='The function will print detailed messages about its progress.')
     args = parser.parse_args()
 
     phenotype_cells(adata=args.adata,