Skip to content

Commit

Permalink
doc string updates
Browse files Browse the repository at this point in the history
  • Loading branch information
ajitjohnson committed Mar 1, 2024
1 parent 44f84b6 commit 5e8437e
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 71 deletions.
1 change: 0 additions & 1 deletion scimap/preprocessing/_combat.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def combat(
The corrected expression data is stored in a new layer `adata.layers['combat']`.
Examples:
```python
# applying batch correction using raw data
Expand Down
43 changes: 25 additions & 18 deletions scimap/preprocessing/_rescale.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,32 +28,33 @@ def rescale (adata,
imageid='imageid',
failed_markers=None,
method='all',
verbose=True,
random_state=0):
"""
Parameters:
adata (AnnData Object):
adata (AnnData Object, required):
An annotated data object that contains single-cell expression data.
gate (DataFrame, optional):
A pandas DataFrame where the first column lists markers, and subsequent columns contain gate values for each image in the dataset. Column names must correspond to unique `imageid` identifiers. If a single column of gate values is provided for a dataset with multiple images, the same gate will be uniformly applied to all. If no gates are provided for specific markers, the function attempts to automatically determine gates using a Gaussian Mixture Model (GMM). Defaults to None.
A pandas DataFrame where the first column lists markers, and subsequent columns contain gate values for each image in the dataset. Column names must correspond to unique `imageid` identifiers. If a single column of gate values is provided for a dataset with multiple images, the same gate will be uniformly applied to all. If no gates are provided for specific markers, the function attempts to automatically determine gates using a Gaussian Mixture Model (GMM).
log (bool, optional):
If True, the data in `adata.raw.X` will be log-transformed (using log1p) before gate application. This transformation is recommended when automatic gate identification through GMM is performed, as it helps in normalizing data distributions. Defaults to True.
If `True`, the data in `adata.raw.X` will be log-transformed (using log1p) before gate application. This transformation is recommended when automatic gate identification through GMM is performed, as it helps in normalizing data distributions.
imageid (str, optional):
The name of the column in `adata` that contains Image IDs. This is necessary for matching manual gates specified in the `gate` DataFrame to their respective images. Defaults to 'imageid'.
The name of the column in `adata` that contains Image IDs. This is necessary for matching manual gates specified in the `gate` DataFrame to their respective images.
failed_markers (dict, optional):
A dictionary mapping `imageid` to markers that failed quality control. This allows for the exclusion of specific markers from the analysis based on prior visual inspection or other criteria. The dictionary can use 'all' as a key to specify markers that failed across all images. Defaults to None.
A dictionary mapping `imageid` to markers that failed quality control. This allows for the exclusion of specific markers from the analysis based on prior visual inspection or other criteria. The dictionary can use `all` as a key to specify markers that failed across all images.
method (str, optional):
Specifies the gating strategy: 'all' to pool data from all images for GMM application, or 'by_image' to apply GMM separately for each image. 'all' may introduce batch effects, while 'by_image' requires sufficient variation within each image to distinguish negative from positive populations effectively. Defaults to 'all'.
Specifies the gating strategy: `all` to pool data from all images for GMM application, or `by_image` to apply GMM separately for each image. `all` may introduce batch effects, while `by_image` requires sufficient variation within each image to distinguish negative from positive populations effectively.
random_state (int, optional):
The seed used by the random number generator for GMM. Ensures reproducibility of results. Defaults to 0.
The seed used by the random number generator for GMM. Ensures reproducibility of results.
verbose (bool, optional):
If True, detailed progress updates and diagnostic messages will be printed during the function's execution. This is useful for tracking the processing stages and debugging. Defaults to True.
If `True`, detailed progress updates and diagnostic messages will be printed during the function's execution.
Returns:
Modified AnnData Object (AnnData):
Expand All @@ -64,14 +65,14 @@ def rescale (adata,
# Example with manual gates
manual_gate = pd.DataFrame({'marker': ['CD3D', 'KI67'], 'gate': [7, 8]})
adata = sm.pp.rescale(adata, gate=manual_gate, failed_markers={'all': ['CD20', 'CD21']}, verbose=True)
adata = sm.pp.rescale(adata, gate=manual_gate, failed_markers={'all': ['CD20', 'CD21']})
# Importing gates from a CSV
manual_gate = pd.read_csv('manual_gates.csv')
adata = sm.pp.rescale(adata, gate=manual_gate, failed_markers={'all': ['CD20', 'CD21']}, verbose=True)
adata = sm.pp.rescale(adata, gate=manual_gate, failed_markers={'all': ['CD20', 'CD21']})
# Running without manual gates to use GMM for automatic gate determination
adata = sm.pp.rescale(adata, gate=None, failed_markers={'all': ['CD20', 'CD21']}, verbose=True)
adata = sm.pp.rescale(adata, gate=None, failed_markers={'all': ['CD20', 'CD21']})
```
Expand All @@ -84,8 +85,8 @@ def rescale (adata,
adata.raw = adata

# Mapping between markers and gates in the given dataset
dataset_markers = list(adata.var.index)
dataset_images = list(adata.obs[imageid].unique())
dataset_markers = adata.var.index.tolist()
dataset_images = adata.obs[imageid].unique().tolist()
m= pd.DataFrame(index=dataset_markers, columns=dataset_images).reset_index()
m= pd.melt(m, id_vars=[m.columns[0]])
m.columns = ['markers', 'imageid', 'gate']
Expand All @@ -106,7 +107,8 @@ def rescale (adata,

# Addressing failed markers
def process_failed (adata_subset, foramted_failed_markers):
print('Processing Failed Marker in ' + str(adata_subset.obs[imageid].unique()[0]))
if verbose:
print('Processing Failed Marker in ' + str(adata_subset.obs[imageid].unique()[0]))
# prepare data
data_subset = pd.DataFrame(adata_subset.raw.X, columns=adata_subset.var.index, index=adata_subset.obs.index)
if log is True:
Expand Down Expand Up @@ -178,15 +180,17 @@ def clipping (x):

# Find GMM based gates
def gmm_gating (marker, data):
print('Finding the optimal gate by GMM for ' + str(marker))
if verbose:
print('Finding the optimal gate by GMM for ' + str(marker))
data_gm = data[marker].values.reshape(-1, 1)
gmm = GaussianMixture(n_components=2, random_state=random_state).fit(data_gm)
gate = np.mean(gmm.means_)
return gate

# Running gmm_gating on the dataset
def gmm_gating_internal (adata_subset, gate_mapping, method):
print('GMM for ' + str(adata_subset.obs[imageid].unique()))
if verbose:
print('GMM for ' + str(adata_subset.obs[imageid].unique()))
data_subset = pd.DataFrame(adata_subset.raw.X, columns=adata_subset.var.index, index=adata_subset.obs.index)
# find markers
if method == 'all':
Expand Down Expand Up @@ -240,7 +244,8 @@ def gmm_gating_internal (adata_subset, gate_mapping, method):

# Rescaling function
def data_scaler (adata_subset, gate_mapping):
print('Scaling Image ' + str(adata_subset.obs[imageid].unique()[0]))
if verbose:
print('Scaling Image ' + str(adata_subset.obs[imageid].unique()[0]))
# Organise data
data_subset = pd.DataFrame(adata_subset.raw.X, columns=adata_subset.var.index, index=adata_subset.obs.index)
if log is True:
Expand All @@ -250,7 +255,8 @@ def data_scaler (adata_subset, gate_mapping):

# organise gates
def data_scaler_internal (marker, gate_mapping_sub):
print('Scaling ' + str(marker))
if verbose:
print('Scaling ' + str(marker))
# find the gate
moi = gate_mapping_sub[gate_mapping_sub.markers == marker]['gate'].values[0]

Expand Down Expand Up @@ -338,6 +344,7 @@ def data_scaler_internal (marker, gate_mapping_sub):
parser.add_argument('--imageid', type=str, default='imageid', help='The column containing the Image IDs')
parser.add_argument('--failedmarkers', type=str, default=None, help='Markers that were deemed to have failed based on prior visual inspection')
parser.add_argument('--method', type=str, default='all', help='Two avialble option are- all or by_image')
parser.add_argument('--verbose', type=bool, default=None, help='The function will print detailed messages about its progress')
parser.add_argument('--randomstate', type=str, default=0, help='Seed for GMM. The default is 0')
args = parser.parse_args()

Expand Down
103 changes: 51 additions & 52 deletions scimap/tools/_phenotype_cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,21 @@
# @author: Ajit Johnson Nirmal
"""
!!! abstract "Short Description"
`sm.tl.phenotype_cells`: The phenotyping function takes in the `scaled data` and a prior knowledge based `phenotype workflow`
file to assign a phenotype annotation to each cell in the dataset. Use the `sm.pp.rescale` function to scale the data first.
*Phenotype workflow file description:*
An example of the `phenotype_workflow.csv` can be found [here](https://github.com/ajitjohnson/scimap/blob/master/scimap/tests/_data/phenotype_workflow.csv).
The `phenotype_workflow` accepts six categories of gating strategy for performing phenotyping.
- allpos
- allneg
- anypos
- anyneg
- pos
- neg
`allpos`- All of the defined markers should be positive.
`allneg`- All of the defined markers should be negative.
`anypos`- Any one of the defined markers being positive is sufficient. (e.g.) For defining macrophages, one could use a strategy in which a cell is defined as a macrophage if any of `CD68, CD163 or CD206` is positive.
`anyneg`- Any one of the defined markers is negative.
`pos`- A given marker is positive. This argument can be passed to multiple markers. (e.g.) If a regulatory T cell is defined as `CD4+`, `FOXP3+` by passing `pos` to each of the markers and the algorithm finds that for a few cells one of the two is not positive, the algorithm will assign the cell as likely-regulatory T cell and will allow the user to make the decision later.
`neg`- A given marker is negative.
*It is always advised to use positive markers over negative markers*
`sm.tl.phenotype_cells`: This function annotates each cell in the dataset with a phenotype based on `scaled data` and a predefined `phenotype workflow`. Before using this function, ensure the data is scaled with the `sm.pp.rescale` function.
*Description of the Phenotype Workflow File:*
Find an example `phenotype_workflow.csv` [here](https://github.com/ajitjohnson/scimap/blob/master/scimap/tests/_data/phenotype_workflow.csv).
The `phenotype_workflow` file outlines six gating strategies for cell phenotyping:
- **allpos**: Requires all specified markers to be positive for a cell to be assigned the phenotype.
- **allneg**: Requires all specified markers to be negative for a cell to be assigned the phenotype.
- **anypos**: Requires at least one of the specified markers to be positive for a cell to be assigned the phenotype. For example, a macrophage could be identified if it is positive for any of the markers `CD68`, `CD163`, or `CD206`.
- **anyneg**: Requires at least one of the specified markers to be negative for a cell to be assigned the phenotype.
- **pos**: Specifies that a cell must be positive for the given marker(s) to be assigned the phenotype. If used for multiple markers, cells not meeting all criteria may still be classified as a potential phenotype, allowing for later refinement by the user. For instance, regulatory T cells could be defined as `CD4+` and `FOXP3+`; cells not fully meeting these criteria might be labeled as likely-regulatory T cells for further evaluation.
- **neg**: Specifies that a cell must be negative for the given marker(s) to be assigned the phenotype.
*Recommendation*: Prioritize using positive markers to define phenotypes whenever possible.
## Function
"""
Expand All @@ -43,46 +35,49 @@ def phenotype_cells (adata,
label="phenotype",
imageid='imageid',
pheno_threshold_percent=None,
pheno_threshold_abs=None):
pheno_threshold_abs=None,
verbose=True
):
"""
Parameters:
adata (AnnData):
The input AnnData object containing single-cell data for phenotyping.
adata : anndata object
phenotype (dataframe):
A gating strategy for phenotyping the cells. An example `workflow` provided [here](https://github.com/ajitjohnson/scimap/blob/master/scimap/tests/_data/phenotype_workflow.csv).
phenotype (pd.DataFrame):
A DataFrame specifying the gating strategy for cell phenotyping. It should outline the workflow for phenotype classification based on marker expression levels. An example workflow is available at [this GitHub link](https://github.com/ajitjohnson/scimap/blob/master/scimap/tests/_data/phenotype_workflow.csv).
gate (int):
By default, the rescale function scales the data such that values above 0.5 are considered positive cells.
gate (float, optional):
The threshold value for determining positive cell classification based on scaled data. By convention, values above this threshold are considered to indicate positive cells.
label (string):
Name of the column under which the final phenotype calls will be saved.
label (str):
The name of the column in `adata.obs` where the final phenotype classifications will be stored. This label will be used to access the phenotyping results within the `AnnData` object.
imageid (string):
Name of the column that contains the unique imageid. This is only utilized
when the user uses `pheno_threshold_percent` or `pheno_threshold_abs` parameters.
imageid (str, optional):
The name of the column in `adata.obs` that contains unique image identifiers. This is crucial for analyses that require differentiation of data based on the source image, especially when using phenotype threshold parameters (`pheno_threshold_percent` or `pheno_threshold_abs`).
pheno_threshold_percent (float):
Accepts values between (0-100). If any particular phenotype is below the user defined threshold,
it is recategorised as 'unknown'. Generally used to deal with low background false positives.
pheno_threshold_percent (float, optional):
A threshold value (between 0 and 100) specifying the minimum percentage of cells that must exhibit a particular phenotype for it to be considered valid. Phenotypes not meeting this threshold are reclassified as 'unknown'. This parameter is useful for minimizing the impact of low-frequency false positives.
pheno_threshold_abs (int):
Serves the same purpose as that of pheno_threshold_percent. However, an absolute
number can be passed. For example, if user passes in 10- any phenotype that contains
less than 10 cells will be recategorized as unknown.
pheno_threshold_abs (int, optional):
Similar to `pheno_threshold_percent`, but uses an absolute cell count instead of a percentage. Phenotypes with cell counts below this threshold are reclassified as 'unknown'. This can help in addressing rare phenotype classifications that may not be meaningful.
verbose (bool):
If set to `True`, the function will print detailed messages about its progress and the steps being executed.
Returns:
adata
Updated AnnData object with the phenotype calls for each cell. Check `adata.obs['phenotype']` for results.
AnnData:
The input AnnData object, updated to include the phenotype classifications for each cell. The phenotyping results can be found in `adata.obs[label]`, where `label` is the name specified by the user for the phenotype column.
Example:
```python
# load the workflow csv file
# Load the phenotype workflow CSV file
phenotype = pd.read_csv('path/to/csv/file/')
# phenotype the cells based on the workflow provided
adata = sm.tl.phenotype_cells (adata, phenotype=phenotype,
gate = 0.5, label="phenotype")
# Apply phenotyping to cells based on the specified workflow
adata = sm.tl.phenotype_cells(adata, phenotype=phenotype, gate=0.5, label="phenotype")
```
"""
Expand Down Expand Up @@ -133,8 +128,9 @@ def gate_satisfation_morethan (marker, data, gate):
r_gate_satisfation_morethan = lambda x: gate_satisfation_morethan(marker=x, data=data, gate=gate)

def prob_mapper (data, all_phenotype, cell, gate):

print("Phenotyping " + str(cell))

if verbose:
print("Phenotyping " + str(cell))

# Get the appropriate dict from all_phenotype
p = all_phenotype[cell]
Expand Down Expand Up @@ -280,15 +276,17 @@ def prob_mapper (data, all_phenotype, cell, gate):
#cells_of_interest = phenotype_labels[phenotype_labels[column_of_interest] == i].index
cells_of_interest = phenotype_labels[phenotype_labels[column_of_interest].eq(i).any(axis=1)].index
d = data.loc[cells_of_interest]
print("-- Subsetting " + str(i))
if verbose:
print("-- Subsetting " + str(i))
phenotype_l = pd.DataFrame(phenotype_cells(data = d, group = i, phenotype=phenotype, gate=gate), columns = [i])
phenotype_labels = phenotype_labels.merge(phenotype_l, how='outer', left_index=True, right_index=True)

# Rearrange the rows back to original
phenotype_labels = phenotype_labels.reindex(data.index)
phenotype_labels = phenotype_labels.replace('-rest', np.nan, regex=True)

print("Consolidating the phenotypes across all groups")
if verbose:
print("Consolidating the phenotypes across all groups")
phenotype_labels_Consolidated = phenotype_labels.fillna(method='ffill', axis = 1)
phenotype_labels[label] = phenotype_labels_Consolidated.iloc[:,-1].values

Expand Down Expand Up @@ -342,6 +340,7 @@ def remove_phenotype(p, ID, pheno_threshold_percent, pheno_threshold_abs):
parser.add_argument('--imageid', type=str, default='imageid', help='Name of the column that contains the unique imageid')
parser.add_argument('--pheno_threshold_percent', type=float, default=True, help='Accepts values between (0-100). If any particular phenotype is below the user defined threshold, it is recategorised as unknown')
parser.add_argument('--pheno_threshold_abs', type=int, default=None, help='Serves the same purpose as that of pheno_threshold_percent. However, an absolute number can be passed')
parser.add_argument('--verbose', required=False, default=True, help='The function will print detailed messages about its progress.')
args = parser.parse_args()

phenotype_cells(adata=args.adata,
Expand Down

0 comments on commit 5e8437e

Please sign in to comment.