Commit 66fc170

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

pre-commit-ci[bot] committed Dec 13, 2024
1 parent 746a38c commit 66fc170
Showing 20 changed files with 410 additions and 129 deletions.
6 changes: 3 additions & 3 deletions README.md.rej
@@ -1,10 +1,10 @@
diff a/README.md b/README.md (rejected hunks)
@@ -17,7 +17,7 @@ Please refer to the [documentation][link-docs]. In particular, the

## Installation

-You need to have Python 3.9 or newer installed on your system. If you don't have
+You need to have Python 3.10 or newer installed on your system. If you don't have
Python installed, we recommend installing [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge).

There are several alternative options to install PopV:
4 changes: 3 additions & 1 deletion popv/_settings.py
@@ -101,7 +101,9 @@ def verbosity(self, level: str | int):
             console = Console(force_terminal=True)
             if console.is_jupyter is True:
                 console.is_jupyter = False
-            ch = RichHandler(level=level, show_path=False, console=console, show_time=False)
+            ch = RichHandler(
+                level=level, show_path=False, console=console, show_time=False
+            )
             formatter = logging.Formatter("%(message)s")
             ch.setFormatter(formatter)
             popv_logger.addHandler(ch)
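For context, the pattern being wrapped here is rich's RichHandler feeding a stdlib logger. A minimal, self-contained sketch; the logger name and level are illustrative assumptions, not the module's exact wiring:

import logging

from rich.console import Console
from rich.logging import RichHandler

logger = logging.getLogger("popv")  # logger name assumed for illustration
console = Console(force_terminal=True)
if console.is_jupyter is True:
    console.is_jupyter = False  # force plain terminal rendering in notebooks

ch = RichHandler(level=logging.INFO, show_path=False, console=console, show_time=False)
ch.setFormatter(logging.Formatter("%(message)s"))
logger.addHandler(ch)
logger.setLevel(logging.INFO)
logger.info("verbosity configured")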
22 changes: 17 additions & 5 deletions popv/_utils.py
@@ -49,7 +49,9 @@ def subsample_dataset(
         if labels_counts[label] < n_samples_per_label:
             sample_idx.append(label_locs)
         else:
-            label_subset = np.random.choice(label_locs, n_samples_per_label, replace=False)
+            label_subset = np.random.choice(
+                label_locs, n_samples_per_label, replace=False
+            )
             sample_idx.append(label_subset)
     sample_idx = np.concatenate(sample_idx)
     return adata.obs_names[sample_idx]
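The hunk above wraps a per-label subsampling call: labels rarer than the cap are kept whole, common ones are downsampled without replacement. A runnable sketch of that technique on toy labels (the array and cap are invented for illustration):

import numpy as np

labels = np.array(["B", "B", "B", "T", "T", "NK"])
n_samples_per_label = 2
labels_counts = dict(zip(*np.unique(labels, return_counts=True)))

sample_idx = []
for label in labels_counts:
    label_locs = np.where(labels == label)[0]
    if labels_counts[label] < n_samples_per_label:
        sample_idx.append(label_locs)  # rare label: keep every cell
    else:
        sample_idx.append(np.random.choice(label_locs, n_samples_per_label, replace=False))
sample_idx = np.concatenate(sample_idx)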
@@ -79,7 +81,9 @@ def check_genes_is_subset(ref_genes, query_genes):
         logging.info("All ref genes are in query dataset. Can use pretrained models.")
         is_subset = True
     else:
-        logging.info("Not all reference genes are in query dataset. Set 'prediction_mode' to 'retrain'.")
+        logging.info(
+            "Not all reference genes are in query dataset. Set 'prediction_mode' to 'retrain'."
+        )
         is_subset = False
     return is_subset

@@ -95,7 +99,9 @@ def make_batch_covariate(adata, batch_keys, new_batch_key):
     batch_keys
         List of keys in adat.obs corresponding to batches
     """
-    adata.obs[new_batch_key] = adata.obs[batch_keys].astype(str).sum(1).astype("category")
+    adata.obs[new_batch_key] = (
+        adata.obs[batch_keys].astype(str).sum(1).astype("category")
+    )


 def calculate_depths(g):
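make_batch_covariate reduces to a one-liner on pandas: string-concatenate the batch columns row-wise, then cast to categorical. A toy sketch on a plain DataFrame (frame and column names invented):

import pandas as pd

obs = pd.DataFrame({"donor": ["d1", "d1", "d2"], "assay": ["10x", "ss2", "10x"]})
batch_keys = ["donor", "assay"]

# row-wise string sum concatenates, e.g. "d1" + "10x" -> "d110x"
obs["batch"] = obs[batch_keys].astype(str).sum(1).astype("category")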
@@ -142,7 +148,9 @@ def make_ontology_dag(obofile, lowercase=False):
     """
     co = obonet.read_obo(obofile, encoding="utf-8")
     id_to_name = {id_: data.get("name") for id_, data in co.nodes(data=True)}
-    name_to_id = {data["name"]: id_ for id_, data in co.nodes(data=True) if ("name" in data)}
+    name_to_id = {
+        data["name"]: id_ for id_, data in co.nodes(data=True) if ("name" in data)
+    }

     # get all node ids that are celltypes (start with CL)
     cl_ids = {id_: True for _, id_ in name_to_id.items() if id_.startswith("CL:")}
@@ -160,7 +168,11 @@ def make_ontology_dag(obofile, lowercase=False):
     for node in co.nodes():
         if node in cl_ids:
             for child, parent, key in co.out_edges(node, keys=True):
-                if child.startswith("CL:") and parent.startswith("CL:") and key == "is_a":
+                if (
+                    child.startswith("CL:")
+                    and parent.startswith("CL:")
+                    and key == "is_a"
+                ):
                     childname = id_to_name[child]
                     parentname = id_to_name[parent]
                     g.add_edge(childname, parentname, key=key)
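The surrounding function builds a cell-type DAG from the Cell Ontology. A condensed sketch under the same approach; the local cl.obo path is a placeholder, and the edge loop is simplified to iterate all edges rather than per-node out_edges:

import networkx as nx
import obonet

co = obonet.read_obo("cl.obo", encoding="utf-8")  # path is a placeholder
id_to_name = {id_: data.get("name") for id_, data in co.nodes(data=True)}

g = nx.MultiDiGraph()
for child, parent, key in co.edges(keys=True):
    # keep only is_a edges between Cell Ontology (CL:) terms
    if child.startswith("CL:") and parent.startswith("CL:") and key == "is_a":
        g.add_edge(id_to_name[child], id_to_name[parent], key=key)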
16 changes: 12 additions & 4 deletions popv/algorithms/_bbknn.py
@@ -86,7 +86,9 @@ def predict(self, adata):
             ]
         )
         if smallest_neighbor_graph < 15:
-            logging.warning(f"BBKNN found only {smallest_neighbor_graph} neighbors. Reduced neighbors in KNN.")
+            logging.warning(
+                f"BBKNN found only {smallest_neighbor_graph} neighbors. Reduced neighbors in KNN."
+            )
             self.classifier_dict["n_neighbors"] = smallest_neighbor_graph

         knn = KNeighborsClassifier(metric="precomputed", **self.classifier_dict)
@@ -95,9 +97,15 @@ def predict(self, adata):
         adata.obs[self.result_key] = knn.predict(test_distances)

         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(knn.predict_proba(test_distances), axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                knn.predict_proba(test_distances), axis=1
+            )

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of bbknn results to adata.obs["{self.embedding_key}"]')
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            logging.info(
+                f'Saving UMAP of bbknn results to adata.obs["{self.embedding_key}"]'
+            )
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
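The classifier wrapped here is scikit-learn's KNN over precomputed BBKNN distances: fit takes a square reference-by-reference distance matrix, predict takes a query-by-reference matrix. A minimal sketch on toy data (all matrices invented):

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.default_rng(0)
ref, query = rng.normal(size=(20, 5)), rng.normal(size=(4, 5))
train_Y = np.repeat(["a", "b"], 10)

train_distances = pairwise_distances(ref)        # (n_ref, n_ref), square
test_distances = pairwise_distances(query, ref)  # (n_query, n_ref)

knn = KNeighborsClassifier(metric="precomputed", n_neighbors=5)
knn.fit(train_distances, train_Y)
predicted = knn.predict(test_distances)
confidence = np.max(knn.predict_proba(test_distances), axis=1)  # per-cell winning probability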
8 changes: 6 additions & 2 deletions popv/algorithms/_celltypist.py
@@ -63,12 +63,16 @@ def predict(self, adata):
             **self.classifier_dict,
         )
         out_column = (
-            "majority_voting" if "majority_voting" in predictions.predicted_labels.columns else "predicted_labels"
+            "majority_voting"
+            if "majority_voting" in predictions.predicted_labels.columns
+            else "predicted_labels"
         )

         adata.obs[self.result_key] = predictions.predicted_labels[out_column]
         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = predictions.probability_matrix.max(axis=1).values
+            adata.obs[
+                self.result_key + "_probabilities"
+            ] = predictions.probability_matrix.max(axis=1).values

     def compute_embedding(self, adata):
         pass
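The out_column logic prefers celltypist's majority_voting column when majority voting was enabled and falls back to the raw predicted labels. A toy sketch with a stand-in for the celltypist AnnotationResult (all values invented):

from types import SimpleNamespace

import pandas as pd

# Toy stand-in for a celltypist AnnotationResult; real objects come from celltypist.annotate.
predictions = SimpleNamespace(
    predicted_labels=pd.DataFrame(
        {"predicted_labels": ["B cell", "T cell"], "majority_voting": ["B cell", "B cell"]}
    ),
    probability_matrix=pd.DataFrame({"B cell": [0.9, 0.4], "T cell": [0.1, 0.6]}),
)

out_column = (
    "majority_voting"
    if "majority_voting" in predictions.predicted_labels.columns
    else "predicted_labels"
)
labels = predictions.predicted_labels[out_column]
confidence = predictions.probability_matrix.max(axis=1).values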
16 changes: 12 additions & 4 deletions popv/algorithms/_harmony.py
@@ -61,7 +61,9 @@ def __init__(
     def compute_integration(self, adata):
         logging.info("Integrating data with harmony")

-        adata.obsm["X_pca_harmony"] = harmonize(adata.obsm["X_pca"], adata.obs, batch_key=self.batch_key)
+        adata.obsm["X_pca_harmony"] = harmonize(
+            adata.obsm["X_pca"], adata.obs, batch_key=self.batch_key
+        )

     def predict(self, adata, result_key="popv_knn_on_harmony_prediction"):
         logging.info(f'Saving knn on harmony results to adata.obs["{result_key}"]')
@@ -75,7 +77,9 @@ def predict(self, adata, result_key="popv_knn_on_harmony_prediction"):
                 n_neighbors=self.classifier_dict["n_neighbors"],
                 parallel_batch_queries=True,
             ),
-            KNeighborsClassifier(metric="precomputed", weights=self.classifier_dict["weights"]),
+            KNeighborsClassifier(
+                metric="precomputed", weights=self.classifier_dict["weights"]
+            ),
         )

         knn.fit(train_X, train_Y)
@@ -91,6 +95,10 @@ def predict(self, adata, result_key="popv_knn_on_harmony_prediction"):

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of harmony results to adata.obs["{self.embedding_key}"]')
+            logging.info(
+                f'Saving UMAP of harmony results to adata.obs["{self.embedding_key}"]'
+            )
             sc.pp.neighbors(adata, use_rep="X_pca_harmony")
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
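The integration step is a single call to harmonize (the harmony-pytorch function imported in this module), which corrects a PCA embedding against a batch column in obs. A sketch on toy inputs (shapes and batch labels invented):

import numpy as np
import pandas as pd
from harmony import harmonize  # harmony-pytorch

rng = np.random.default_rng(0)
X_pca = rng.normal(size=(100, 10))  # toy PCA embedding
obs = pd.DataFrame({"batch": np.repeat(["b1", "b2"], 50)})

X_pca_harmony = harmonize(X_pca, obs, batch_key="batch")  # batch-corrected embedding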
38 changes: 27 additions & 11 deletions popv/algorithms/_onclass.py
@@ -106,10 +106,12 @@ def compute_integration(self, adata):
         pass

     def predict(self, adata):
-        logging.info(f'Computing Onclass. Storing prediction in adata.obs["{self.result_key}"]')
-        adata.obs.loc[adata.obs["_dataset"] == "query", self.cell_ontology_obs_key] = adata.uns[
-            "unknown_celltype_label"
-        ]
+        logging.info(
+            f'Computing Onclass. Storing prediction in adata.obs["{self.result_key}"]'
+        )
+        adata.obs.loc[
+            adata.obs["_dataset"] == "query", self.cell_ontology_obs_key
+        ] = adata.uns["unknown_celltype_label"]

         train_idx = adata.obs["_dataset"] == "ref"

@@ -127,10 +129,14 @@ def predict(self, adata):
         cl_ontology_file = adata.uns["_cl_ontology_file"]
         nlp_emb_file = adata.uns["_nlp_emb_file"]

-        celltype_dict, clid_2_name = self.make_celltype_to_cell_ontology_id_dict(cl_obo_file)
+        celltype_dict, clid_2_name = self.make_celltype_to_cell_ontology_id_dict(
+            cl_obo_file
+        )
         self.make_cell_ontology_id(adata, celltype_dict, self.cell_ontology_obs_key)

-        train_model = OnClassModel(cell_type_nlp_emb_file=nlp_emb_file, cell_type_network_file=cl_ontology_file)
+        train_model = OnClassModel(
+            cell_type_nlp_emb_file=nlp_emb_file, cell_type_network_file=cl_ontology_file
+        )

         if adata.uns["_save_path_trained_models"] is not None:
             model_path = adata.uns["_save_path_trained_models"] + "/OnClass"
@@ -175,13 +181,17 @@ def predict(self, adata):
             )

             if adata.uns["_prediction_mode"] == "fast":
-                onclass_seen = np.argmax(train_model.model.predict(corr_test_feature), axis=1)
+                onclass_seen = np.argmax(
+                    train_model.model.predict(corr_test_feature), axis=1
+                )
                 pred_label = [train_model.i2co[ind] for ind in onclass_seen]
                 pred_label_str = [clid_2_name[ind] for ind in pred_label]
                 adata.obs[self.result_key] = pred_label_str
                 adata.obs[self.seen_result_key] = pred_label_str
             else:
-                onclass_pred = train_model.Predict(corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0)
+                onclass_pred = train_model.Predict(
+                    corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0
+                )
                 pred_label = [train_model.i2co[ind] for ind in onclass_pred[2]]
                 pred_label_str = [clid_2_name[ind] for ind in pred_label]
                 adata.obs[self.result_key] = pred_label_str
@@ -192,9 +202,15 @@ def predict(self, adata):
             adata.obs[self.seen_result_key] = pred_label_str

         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(onclass_pred[1], axis=1) / onclass_pred[1].sum(1)
-            adata.obsm["onclass_probabilities"] = onclass_pred[1] / onclass_pred[1].sum(1, keepdims=True)
-            adata.obs["popv_onclass_seen" + "_probabilities"] = np.max(onclass_pred[0], axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                onclass_pred[1], axis=1
+            ) / onclass_pred[1].sum(1)
+            adata.obsm["onclass_probabilities"] = onclass_pred[1] / onclass_pred[
+                1
+            ].sum(1, keepdims=True)
+            adata.obs["popv_onclass_seen" + "_probabilities"] = np.max(
+                onclass_pred[0], axis=1
+            )

     def compute_embedding(self, adata):
         return None
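The probability bookkeeping being wrapped here is plain NumPy row normalization: keep the winning score per cell, and normalize each row to sum to one. A sketch with an invented score matrix standing in for onclass_pred[1]:

import numpy as np

scores = np.abs(np.random.default_rng(0).normal(size=(5, 3)))  # stand-in for onclass_pred[1]

winning_prob = np.max(scores, axis=1) / scores.sum(1)  # per-cell confidence
full_matrix = scores / scores.sum(1, keepdims=True)    # row-normalized probability matrix
assert np.allclose(full_matrix.sum(1), 1.0)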
18 changes: 14 additions & 4 deletions popv/algorithms/_rf.py
@@ -48,13 +48,19 @@ def compute_integration(self, adata):
         pass

     def predict(self, adata):
-        logging.info(f'Computing random forest classifier. Storing prediction in adata.obs["{self.result_key}"]')
+        logging.info(
+            f'Computing random forest classifier. Storing prediction in adata.obs["{self.result_key}"]'
+        )

         test_x = adata.layers[self.layers_key] if self.layers_key else adata.X

         if adata.uns["_prediction_mode"] == "retrain":
             train_idx = adata.obs["_ref_subsample"]
-            train_x = adata[train_idx].layers[self.layers_key] if self.layers_key else adata[train_idx].X
+            train_x = (
+                adata[train_idx].layers[self.layers_key]
+                if self.layers_key
+                else adata[train_idx].X
+            )
             train_y = adata[train_idx].obs[self.labels_key].to_numpy()
             rf = RandomForestClassifier(**self.classifier_dict)
             rf.fit(train_x, train_y)
@@ -67,10 +73,14 @@ def predict(self, adata):
                 ),
             )
         else:
-            rf = pickle.load(open(adata.uns["_save_path_trained_models"] + "rf_classifier.pkl", "rb"))
+            rf = pickle.load(
+                open(adata.uns["_save_path_trained_models"] + "rf_classifier.pkl", "rb")
+            )
         adata.obs[self.result_key] = rf.predict(test_x)
         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(
+            adata.obs[self.result_key + "_probabilities"] = np.max(
                 rf.predict_proba(test_x), axis=1
             )

     def compute_embedding(self, adata):
         pass
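One note on the pickle branch: a context manager closes the file handle deterministically, whereas the bare open(...) above leaves closing to garbage collection. A sketch of the round-trip (paths and classifier settings are placeholders):

import pickle

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=10)  # placeholder settings

with open("rf_classifier.pkl", "wb") as f:  # handle closed on block exit
    pickle.dump(rf, f)

with open("rf_classifier.pkl", "rb") as f:
    rf = pickle.load(f)

This is functionally identical to pickle.load(open(...)), but the handle's lifetime is explicit.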
8 changes: 6 additions & 2 deletions popv/algorithms/_scaffold_algorithm.py
@@ -69,12 +69,16 @@ def compute_integration(self, adata):
         # adata.obsm["X_new_method"] = embedded_data

     def predict(self, adata):
-        logging.info(f'Computing new classifier method. Storing prediction in adata.obs["{self.result_key}"]')
+        logging.info(
+            f'Computing new classifier method. Storing prediction in adata.obs["{self.result_key}"]'
+        )
         # adata.obs[self.result_key] = classifier_results

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of new integration method to adata.obs["{self.embedding_key}"]')
+            logging.info(
+                f'Saving UMAP of new integration method to adata.obs["{self.embedding_key}"]'
+            )
             # sc.pp.neighbors(adata, use_rep="embedding_space")
             # adata.obsm[self.embedding_key] = sc.tl.umap(
             #     adata, copy=True, **self.embedding_dict
21 changes: 16 additions & 5 deletions popv/algorithms/_scanorama.py
@@ -62,7 +62,10 @@ def __init__(
     def compute_integration(self, adata):
         logging.info("Integrating data with scanorama")

-        _adatas = [adata[adata.obs[self.batch_key] == i] for i in np.unique(adata.obs[self.batch_key])]
+        _adatas = [
+            adata[adata.obs[self.batch_key] == i]
+            for i in np.unique(adata.obs[self.batch_key])
+        ]
         scanorama.integrate_scanpy(_adatas, **self.method_dict)
         tmp_adata = anndata.concat(_adatas)
         adata.obsm["X_scanorama"] = tmp_adata[adata.obs_names].obsm["X_scanorama"]
@@ -79,7 +82,9 @@ def predict(self, adata, result_key="popv_knn_on_scanorama_prediction"):
                 n_neighbors=self.classifier_dict["n_neighbors"],
                 parallel_batch_queries=True,
             ),
-            KNeighborsClassifier(metric="precomputed", weights=self.classifier_dict["weights"]),
+            KNeighborsClassifier(
+                metric="precomputed", weights=self.classifier_dict["weights"]
+            ),
         )

         knn.fit(train_X, train_Y)
@@ -89,10 +94,16 @@ def predict(self, adata, result_key="popv_knn_on_scanorama_prediction"):
         adata.obs[result_key] = knn_pred

         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(knn.predict_proba(adata.obsm["X_scanorama"]), axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                knn.predict_proba(adata.obsm["X_scanorama"]), axis=1
+            )

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of scanorama results to adata.obs["{self.embedding_key}"]')
+            logging.info(
+                f'Saving UMAP of scanorama results to adata.obs["{self.embedding_key}"]'
+            )
             sc.pp.neighbors(adata, use_rep="X_scanorama")
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
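Scanorama is driven by splitting the AnnData per batch and letting integrate_scanpy write X_scanorama into each piece. A toy end-to-end sketch under assumed defaults; the data are invented, .copy() is added to avoid writing into AnnData views, and dimred is shrunk only because the toy matrix is small:

import anndata
import numpy as np
import pandas as pd
import scanorama

rng = np.random.default_rng(0)
adata = anndata.AnnData(
    X=rng.normal(size=(60, 120)).astype(np.float32),
    obs=pd.DataFrame({"batch": np.repeat(["b1", "b2"], 30)}),
)

# split per batch, integrate, then reassemble in the original cell order
_adatas = [adata[adata.obs["batch"] == b].copy() for b in np.unique(adata.obs["batch"])]
scanorama.integrate_scanpy(_adatas, dimred=20)
tmp = anndata.concat(_adatas)
adata.obsm["X_scanorama"] = tmp[adata.obs_names].obsm["X_scanorama"]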