Commit 66fc170

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

pre-commit-ci[bot] committed Dec 13, 2024
1 parent 746a38c commit 66fc170
Showing 20 changed files with 410 additions and 129 deletions.
6 changes: 3 additions & 3 deletions README.md.rej
@@ -1,10 +1,10 @@
diff a/README.md b/README.md (rejected hunks)
@@ -17,7 +17,7 @@ Please refer to the [documentation][link-docs]. In particular, the

## Installation

-You need to have Python 3.9 or newer installed on your system. If you don't have
+You need to have Python 3.10 or newer installed on your system. If you don't have
Python installed, we recommend installing [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge).

There are several alternative options to install PopV:
4 changes: 3 additions & 1 deletion popv/_settings.py
@@ -101,7 +101,9 @@ def verbosity(self, level: str | int):
             console = Console(force_terminal=True)
             if console.is_jupyter is True:
                 console.is_jupyter = False
-            ch = RichHandler(level=level, show_path=False, console=console, show_time=False)
+            ch = RichHandler(
+                level=level, show_path=False, console=console, show_time=False
+            )
             formatter = logging.Formatter("%(message)s")
             ch.setFormatter(formatter)
             popv_logger.addHandler(ch)
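For context, the pattern being wrapped here is rich's RichHandler feeding a stdlib logger. A minimal, self-contained sketch; the logger name and level are illustrative assumptions, not the module's exact wiring:

import logging

from rich.console import Console
from rich.logging import RichHandler

logger = logging.getLogger("popv")  # logger name assumed for illustration
console = Console(force_terminal=True)
if console.is_jupyter is True:
    console.is_jupyter = False  # force plain terminal rendering in notebooks

ch = RichHandler(level=logging.INFO, show_path=False, console=console, show_time=False)
ch.setFormatter(logging.Formatter("%(message)s"))
logger.addHandler(ch)
logger.setLevel(logging.INFO)
logger.info("verbosity configured")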
22 changes: 17 additions & 5 deletions popv/_utils.py
@@ -49,7 +49,9 @@ def subsample_dataset(
         if labels_counts[label] < n_samples_per_label:
             sample_idx.append(label_locs)
         else:
-            label_subset = np.random.choice(label_locs, n_samples_per_label, replace=False)
+            label_subset = np.random.choice(
+                label_locs, n_samples_per_label, replace=False
+            )
             sample_idx.append(label_subset)
     sample_idx = np.concatenate(sample_idx)
     return adata.obs_names[sample_idx]
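The hunk above wraps a per-label subsampling call: labels rarer than the cap are kept whole, common ones are downsampled without replacement. A runnable sketch of that technique on toy labels (the array and cap are invented for illustration):

import numpy as np

labels = np.array(["B", "B", "B", "T", "T", "NK"])
n_samples_per_label = 2
labels_counts = dict(zip(*np.unique(labels, return_counts=True)))

sample_idx = []
for label in labels_counts:
    label_locs = np.where(labels == label)[0]
    if labels_counts[label] < n_samples_per_label:
        sample_idx.append(label_locs)  # rare label: keep every cell
    else:
        sample_idx.append(np.random.choice(label_locs, n_samples_per_label, replace=False))
sample_idx = np.concatenate(sample_idx)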
@@ -79,7 +81,9 @@ def check_genes_is_subset(ref_genes, query_genes):
         logging.info("All ref genes are in query dataset. Can use pretrained models.")
         is_subset = True
     else:
-        logging.info("Not all reference genes are in query dataset. Set 'prediction_mode' to 'retrain'.")
+        logging.info(
+            "Not all reference genes are in query dataset. Set 'prediction_mode' to 'retrain'."
+        )
         is_subset = False
     return is_subset

@@ -95,7 +99,9 @@ def make_batch_covariate(adata, batch_keys, new_batch_key):
     batch_keys
         List of keys in adat.obs corresponding to batches
     """
-    adata.obs[new_batch_key] = adata.obs[batch_keys].astype(str).sum(1).astype("category")
+    adata.obs[new_batch_key] = (
+        adata.obs[batch_keys].astype(str).sum(1).astype("category")
+    )


 def calculate_depths(g):
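make_batch_covariate reduces to a one-liner on pandas: string-concatenate the batch columns row-wise, then cast to categorical. A toy sketch on a plain DataFrame (frame and column names invented):

import pandas as pd

obs = pd.DataFrame({"donor": ["d1", "d1", "d2"], "assay": ["10x", "ss2", "10x"]})
batch_keys = ["donor", "assay"]

# row-wise string sum concatenates, e.g. "d1" + "10x" -> "d110x"
obs["batch"] = obs[batch_keys].astype(str).sum(1).astype("category")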
@@ -142,7 +148,9 @@ def make_ontology_dag(obofile, lowercase=False):
     """
     co = obonet.read_obo(obofile, encoding="utf-8")
     id_to_name = {id_: data.get("name") for id_, data in co.nodes(data=True)}
-    name_to_id = {data["name"]: id_ for id_, data in co.nodes(data=True) if ("name" in data)}
+    name_to_id = {
+        data["name"]: id_ for id_, data in co.nodes(data=True) if ("name" in data)
+    }

     # get all node ids that are celltypes (start with CL)
     cl_ids = {id_: True for _, id_ in name_to_id.items() if id_.startswith("CL:")}
@@ -160,7 +168,11 @@ def make_ontology_dag(obofile, lowercase=False):
     for node in co.nodes():
         if node in cl_ids:
             for child, parent, key in co.out_edges(node, keys=True):
-                if child.startswith("CL:") and parent.startswith("CL:") and key == "is_a":
+                if (
+                    child.startswith("CL:")
+                    and parent.startswith("CL:")
+                    and key == "is_a"
+                ):
                     childname = id_to_name[child]
                     parentname = id_to_name[parent]
                     g.add_edge(childname, parentname, key=key)
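The surrounding function builds a cell-type DAG from the Cell Ontology. A condensed sketch under the same approach; the local cl.obo path is a placeholder, and the edge loop is simplified to iterate all edges rather than per-node out_edges:

import networkx as nx
import obonet

co = obonet.read_obo("cl.obo", encoding="utf-8")  # path is a placeholder
id_to_name = {id_: data.get("name") for id_, data in co.nodes(data=True)}

g = nx.MultiDiGraph()
for child, parent, key in co.edges(keys=True):
    # keep only is_a edges between Cell Ontology (CL:) terms
    if child.startswith("CL:") and parent.startswith("CL:") and key == "is_a":
        g.add_edge(id_to_name[child], id_to_name[parent], key=key)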
16 changes: 12 additions & 4 deletions popv/algorithms/_bbknn.py
@@ -86,7 +86,9 @@ def predict(self, adata):
             ]
         )
         if smallest_neighbor_graph < 15:
-            logging.warning(f"BBKNN found only {smallest_neighbor_graph} neighbors. Reduced neighbors in KNN.")
+            logging.warning(
+                f"BBKNN found only {smallest_neighbor_graph} neighbors. Reduced neighbors in KNN."
+            )
             self.classifier_dict["n_neighbors"] = smallest_neighbor_graph

         knn = KNeighborsClassifier(metric="precomputed", **self.classifier_dict)
@@ -95,9 +97,15 @@ def predict(self, adata):
         adata.obs[self.result_key] = knn.predict(test_distances)

         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(knn.predict_proba(test_distances), axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                knn.predict_proba(test_distances), axis=1
+            )

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of bbknn results to adata.obs["{self.embedding_key}"]')
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            logging.info(
+                f'Saving UMAP of bbknn results to adata.obs["{self.embedding_key}"]'
+            )
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
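The classifier wrapped here is scikit-learn's KNN over precomputed BBKNN distances: fit takes a square reference-by-reference distance matrix, predict takes a query-by-reference matrix. A minimal sketch on toy data (all matrices invented):

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.default_rng(0)
ref, query = rng.normal(size=(20, 5)), rng.normal(size=(4, 5))
train_Y = np.repeat(["a", "b"], 10)

train_distances = pairwise_distances(ref)        # (n_ref, n_ref), square
test_distances = pairwise_distances(query, ref)  # (n_query, n_ref)

knn = KNeighborsClassifier(metric="precomputed", n_neighbors=5)
knn.fit(train_distances, train_Y)
predicted = knn.predict(test_distances)
confidence = np.max(knn.predict_proba(test_distances), axis=1)  # per-cell winning probability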
8 changes: 6 additions & 2 deletions popv/algorithms/_celltypist.py
@@ -63,12 +63,16 @@ def predict(self, adata):
             **self.classifier_dict,
         )
         out_column = (
-            "majority_voting" if "majority_voting" in predictions.predicted_labels.columns else "predicted_labels"
+            "majority_voting"
+            if "majority_voting" in predictions.predicted_labels.columns
+            else "predicted_labels"
         )

         adata.obs[self.result_key] = predictions.predicted_labels[out_column]
         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = predictions.probability_matrix.max(axis=1).values
+            adata.obs[
+                self.result_key + "_probabilities"
+            ] = predictions.probability_matrix.max(axis=1).values

     def compute_embedding(self, adata):
         pass
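The out_column logic prefers celltypist's majority_voting column when majority voting was enabled and falls back to the raw predicted labels. A toy sketch with a stand-in for the celltypist AnnotationResult (all values invented):

from types import SimpleNamespace

import pandas as pd

# Toy stand-in for a celltypist AnnotationResult; real objects come from celltypist.annotate.
predictions = SimpleNamespace(
    predicted_labels=pd.DataFrame(
        {"predicted_labels": ["B cell", "T cell"], "majority_voting": ["B cell", "B cell"]}
    ),
    probability_matrix=pd.DataFrame({"B cell": [0.9, 0.4], "T cell": [0.1, 0.6]}),
)

out_column = (
    "majority_voting"
    if "majority_voting" in predictions.predicted_labels.columns
    else "predicted_labels"
)
labels = predictions.predicted_labels[out_column]
confidence = predictions.probability_matrix.max(axis=1).values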
16 changes: 12 additions & 4 deletions popv/algorithms/_harmony.py
@@ -61,7 +61,9 @@ def __init__(
     def compute_integration(self, adata):
         logging.info("Integrating data with harmony")

-        adata.obsm["X_pca_harmony"] = harmonize(adata.obsm["X_pca"], adata.obs, batch_key=self.batch_key)
+        adata.obsm["X_pca_harmony"] = harmonize(
+            adata.obsm["X_pca"], adata.obs, batch_key=self.batch_key
+        )

     def predict(self, adata, result_key="popv_knn_on_harmony_prediction"):
         logging.info(f'Saving knn on harmony results to adata.obs["{result_key}"]')
@@ -75,7 +77,9 @@ def predict(self, adata, result_key="popv_knn_on_harmony_prediction"):
                 n_neighbors=self.classifier_dict["n_neighbors"],
                 parallel_batch_queries=True,
             ),
-            KNeighborsClassifier(metric="precomputed", weights=self.classifier_dict["weights"]),
+            KNeighborsClassifier(
+                metric="precomputed", weights=self.classifier_dict["weights"]
+            ),
         )

         knn.fit(train_X, train_Y)
@@ -91,6 +95,10 @@ def predict(self, adata, result_key="popv_knn_on_harmony_prediction"):

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of harmony results to adata.obs["{self.embedding_key}"]')
+            logging.info(
+                f'Saving UMAP of harmony results to adata.obs["{self.embedding_key}"]'
+            )
             sc.pp.neighbors(adata, use_rep="X_pca_harmony")
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
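The integration step is a single call to harmonize (the harmony-pytorch function imported in this module), which corrects a PCA embedding against a batch column in obs. A sketch on toy inputs (shapes and batch labels invented):

import numpy as np
import pandas as pd
from harmony import harmonize  # harmony-pytorch

rng = np.random.default_rng(0)
X_pca = rng.normal(size=(100, 10))  # toy PCA embedding
obs = pd.DataFrame({"batch": np.repeat(["b1", "b2"], 50)})

X_pca_harmony = harmonize(X_pca, obs, batch_key="batch")  # batch-corrected embedding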
38 changes: 27 additions & 11 deletions popv/algorithms/_onclass.py
@@ -106,10 +106,12 @@ def compute_integration(self, adata):
         pass

     def predict(self, adata):
-        logging.info(f'Computing Onclass. Storing prediction in adata.obs["{self.result_key}"]')
-        adata.obs.loc[adata.obs["_dataset"] == "query", self.cell_ontology_obs_key] = adata.uns[
-            "unknown_celltype_label"
-        ]
+        logging.info(
+            f'Computing Onclass. Storing prediction in adata.obs["{self.result_key}"]'
+        )
+        adata.obs.loc[
+            adata.obs["_dataset"] == "query", self.cell_ontology_obs_key
+        ] = adata.uns["unknown_celltype_label"]

         train_idx = adata.obs["_dataset"] == "ref"

@@ -127,10 +129,14 @@ def predict(self, adata):
         cl_ontology_file = adata.uns["_cl_ontology_file"]
         nlp_emb_file = adata.uns["_nlp_emb_file"]

-        celltype_dict, clid_2_name = self.make_celltype_to_cell_ontology_id_dict(cl_obo_file)
+        celltype_dict, clid_2_name = self.make_celltype_to_cell_ontology_id_dict(
+            cl_obo_file
+        )
         self.make_cell_ontology_id(adata, celltype_dict, self.cell_ontology_obs_key)

-        train_model = OnClassModel(cell_type_nlp_emb_file=nlp_emb_file, cell_type_network_file=cl_ontology_file)
+        train_model = OnClassModel(
+            cell_type_nlp_emb_file=nlp_emb_file, cell_type_network_file=cl_ontology_file
+        )

         if adata.uns["_save_path_trained_models"] is not None:
             model_path = adata.uns["_save_path_trained_models"] + "/OnClass"
@@ -175,13 +181,17 @@ def predict(self, adata):
             )

             if adata.uns["_prediction_mode"] == "fast":
-                onclass_seen = np.argmax(train_model.model.predict(corr_test_feature), axis=1)
+                onclass_seen = np.argmax(
+                    train_model.model.predict(corr_test_feature), axis=1
+                )
                 pred_label = [train_model.i2co[ind] for ind in onclass_seen]
                 pred_label_str = [clid_2_name[ind] for ind in pred_label]
                 adata.obs[self.result_key] = pred_label_str
                 adata.obs[self.seen_result_key] = pred_label_str
             else:
-                onclass_pred = train_model.Predict(corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0)
+                onclass_pred = train_model.Predict(
+                    corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0
+                )
                 pred_label = [train_model.i2co[ind] for ind in onclass_pred[2]]
                 pred_label_str = [clid_2_name[ind] for ind in pred_label]
                 adata.obs[self.result_key] = pred_label_str
@@ -192,9 +202,15 @@ def predict(self, adata):
             adata.obs[self.seen_result_key] = pred_label_str

         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(onclass_pred[1], axis=1) / onclass_pred[1].sum(1)
-            adata.obsm["onclass_probabilities"] = onclass_pred[1] / onclass_pred[1].sum(1, keepdims=True)
-            adata.obs["popv_onclass_seen" + "_probabilities"] = np.max(onclass_pred[0], axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                onclass_pred[1], axis=1
+            ) / onclass_pred[1].sum(1)
+            adata.obsm["onclass_probabilities"] = onclass_pred[1] / onclass_pred[
+                1
+            ].sum(1, keepdims=True)
+            adata.obs["popv_onclass_seen" + "_probabilities"] = np.max(
+                onclass_pred[0], axis=1
+            )

     def compute_embedding(self, adata):
         return None
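The probability bookkeeping being wrapped here is plain NumPy row normalization: keep the winning score per cell, and normalize each row to sum to one. A sketch with an invented score matrix standing in for onclass_pred[1]:

import numpy as np

scores = np.abs(np.random.default_rng(0).normal(size=(5, 3)))  # stand-in for onclass_pred[1]

winning_prob = np.max(scores, axis=1) / scores.sum(1)  # per-cell confidence
full_matrix = scores / scores.sum(1, keepdims=True)    # row-normalized probability matrix
assert np.allclose(full_matrix.sum(1), 1.0)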
18 changes: 14 additions & 4 deletions popv/algorithms/_rf.py
@@ -48,13 +48,19 @@ def compute_integration(self, adata):
         pass

     def predict(self, adata):
-        logging.info(f'Computing random forest classifier. Storing prediction in adata.obs["{self.result_key}"]')
+        logging.info(
+            f'Computing random forest classifier. Storing prediction in adata.obs["{self.result_key}"]'
+        )

         test_x = adata.layers[self.layers_key] if self.layers_key else adata.X

         if adata.uns["_prediction_mode"] == "retrain":
             train_idx = adata.obs["_ref_subsample"]
-            train_x = adata[train_idx].layers[self.layers_key] if self.layers_key else adata[train_idx].X
+            train_x = (
+                adata[train_idx].layers[self.layers_key]
+                if self.layers_key
+                else adata[train_idx].X
+            )
             train_y = adata[train_idx].obs[self.labels_key].to_numpy()
             rf = RandomForestClassifier(**self.classifier_dict)
             rf.fit(train_x, train_y)
@@ -67,10 +73,14 @@ def predict(self, adata):
                 ),
             )
         else:
-            rf = pickle.load(open(adata.uns["_save_path_trained_models"] + "rf_classifier.pkl", "rb"))
+            rf = pickle.load(
+                open(adata.uns["_save_path_trained_models"] + "rf_classifier.pkl", "rb")
+            )
         adata.obs[self.result_key] = rf.predict(test_x)
         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(
+            adata.obs[self.result_key + "_probabilities"] = np.max(
                 rf.predict_proba(test_x), axis=1
             )

     def compute_embedding(self, adata):
         pass
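One note on the pickle branch: a context manager closes the file handle deterministically, whereas the bare open(...) above leaves closing to garbage collection. A sketch of the round-trip (paths and classifier settings are placeholders):

import pickle

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=10)  # placeholder settings

with open("rf_classifier.pkl", "wb") as f:  # handle closed on block exit
    pickle.dump(rf, f)

with open("rf_classifier.pkl", "rb") as f:
    rf = pickle.load(f)

This is functionally identical to pickle.load(open(...)), but the handle's lifetime is explicit.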
8 changes: 6 additions & 2 deletions popv/algorithms/_scaffold_algorithm.py
@@ -69,12 +69,16 @@ def compute_integration(self, adata):
         # adata.obsm["X_new_method"] = embedded_data

     def predict(self, adata):
-        logging.info(f'Computing new classifier method. Storing prediction in adata.obs["{self.result_key}"]')
+        logging.info(
+            f'Computing new classifier method. Storing prediction in adata.obs["{self.result_key}"]'
+        )
         # adata.obs[self.result_key] = classifier_results

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of new integration method to adata.obs["{self.embedding_key}"]')
+            logging.info(
+                f'Saving UMAP of new integration method to adata.obs["{self.embedding_key}"]'
+            )
             # sc.pp.neighbors(adata, use_rep="embedding_space")
             # adata.obsm[self.embedding_key] = sc.tl.umap(
             #     adata, copy=True, **self.embedding_dict
21 changes: 16 additions & 5 deletions popv/algorithms/_scanorama.py
@@ -62,7 +62,10 @@ def __init__(
     def compute_integration(self, adata):
         logging.info("Integrating data with scanorama")

-        _adatas = [adata[adata.obs[self.batch_key] == i] for i in np.unique(adata.obs[self.batch_key])]
+        _adatas = [
+            adata[adata.obs[self.batch_key] == i]
+            for i in np.unique(adata.obs[self.batch_key])
+        ]
         scanorama.integrate_scanpy(_adatas, **self.method_dict)
         tmp_adata = anndata.concat(_adatas)
         adata.obsm["X_scanorama"] = tmp_adata[adata.obs_names].obsm["X_scanorama"]
@@ -79,7 +82,9 @@ def predict(self, adata, result_key="popv_knn_on_scanorama_prediction"):
                 n_neighbors=self.classifier_dict["n_neighbors"],
                 parallel_batch_queries=True,
             ),
-            KNeighborsClassifier(metric="precomputed", weights=self.classifier_dict["weights"]),
+            KNeighborsClassifier(
+                metric="precomputed", weights=self.classifier_dict["weights"]
+            ),
         )

         knn.fit(train_X, train_Y)
@@ -89,10 +94,16 @@ def predict(self, adata, result_key="popv_knn_on_scanorama_prediction"):
         adata.obs[result_key] = knn_pred

         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(knn.predict_proba(adata.obsm["X_scanorama"]), axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                knn.predict_proba(adata.obsm["X_scanorama"]), axis=1
+            )

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of scanorama results to adata.obs["{self.embedding_key}"]')
+            logging.info(
+                f'Saving UMAP of scanorama results to adata.obs["{self.embedding_key}"]'
+            )
             sc.pp.neighbors(adata, use_rep="X_scanorama")
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
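Scanorama is driven by splitting the AnnData per batch and letting integrate_scanpy write X_scanorama into each piece. A toy end-to-end sketch under assumed defaults; the data are invented, .copy() is added to avoid writing into AnnData views, and dimred is shrunk only because the toy matrix is small:

import anndata
import numpy as np
import pandas as pd
import scanorama

rng = np.random.default_rng(0)
adata = anndata.AnnData(
    X=rng.normal(size=(60, 120)).astype(np.float32),
    obs=pd.DataFrame({"batch": np.repeat(["b1", "b2"], 30)}),
)

# split per batch, integrate, then reassemble in the original cell order
_adatas = [adata[adata.obs["batch"] == b].copy() for b in np.unique(adata.obs["batch"])]
scanorama.integrate_scanpy(_adatas, dimred=20)
tmp = anndata.concat(_adatas)
adata.obsm["X_scanorama"] = tmp[adata.obs_names].obsm["X_scanorama"]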