diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b7b49b4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 David DeTomaso
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/hotspot/hotspot.py b/hotspot/hotspot.py
index f2e526f..a858f66 100644
--- a/hotspot/hotspot.py
+++ b/hotspot/hotspot.py
@@ -52,12 +52,6 @@ def __init__(
             If omitted, the sum over genes in the counts matrix is used
         """
 
-        self.counts = counts
-        self.latent = latent
-        self.distances = distances
-        self.tree = tree
-        self.model = model
-
         if latent is None and distances is None and tree is None:
             raise ValueError("Neither `latent` or `tree` or `distance` arguments were supplied. One of these is required")
 
@@ -71,7 +65,10 @@ def __init__(
             raise ValueError("Both `distances` and `tree` provided - only one of these should be provided.")
 
         if latent is not None:
-            assert counts.shape[1] == latent.shape[0]
+            if counts.shape[1] != latent.shape[0]:
+                if counts.shape[0] == latent.shape[0]:
+                    raise ValueError("`counts` input should be a Genes x Cells dataframe. Maybe needs transpose?")
+                raise ValueError("Size mismatch counts/latent. Columns of `counts` should match rows of `latent`.")
 
         if distances is not None:
             assert counts.shape[1] == distances.shape[0]
@@ -106,6 +103,20 @@ def __init__(
                 'Input `model` should be one of {}'.format(valid_models)
             )
 
+        valid_genes = counts.var(axis=1) > 0
+        n_invalid = counts.shape[0] - valid_genes.sum()
+        if n_invalid > 0:
+            counts = counts.loc[valid_genes]
+            print(
+                "\nRemoving {} undetected/non-varying genes".format(n_invalid)
+            )
+
+        self.counts = counts
+        self.latent = latent
+        self.distances = distances
+        self.tree = tree
+        self.model = model
+
         self.umi_counts = umi_counts
 
         self.graph = None
diff --git a/hotspot/modules.py b/hotspot/modules.py
index acfc1da..7159f88 100644
--- a/hotspot/modules.py
+++ b/hotspot/modules.py
@@ -337,8 +337,11 @@ def compute_modules(Z_scores, min_gene_threshold=10, fdr_threshold=None, z_thres
         allZ = np.sort(allZ)
         allP = norm.sf(allZ)
         allP_c = multipletests(allP, method='fdr_bh')[1]
-        ii = np.nonzero(allP_c < fdr_threshold)[0][0]
-        z_threshold = allZ[ii]
+        ii = np.nonzero(allP_c < fdr_threshold)[0]
+        if ii.size > 0:
+            z_threshold = allZ[ii[0]]
+        else:
+            z_threshold = allZ[-1]+1
 
     # Compute the linkage matrix
     dd = Z_scores.copy().values
diff --git a/tests/test_validations.py b/tests/test_validations.py
new file mode 100644
index 0000000..a161dcb
--- /dev/null
+++ b/tests/test_validations.py
@@ -0,0 +1,96 @@
+import numpy as np
+import pandas as pd
+from hotspot import sim_data
+from hotspot import Hotspot
+
+
+def test_models():
+    """
+    Ensure each model runs
+    """
+
+    # Simulate some data
+    N_CELLS = 100
+    N_DIM = 10
+    N_GENES = 10
+
+    latent = sim_data.sim_latent(N_CELLS, N_DIM)
+    latent = pd.DataFrame(
+        latent,
+        index=['Cell{}'.format(i+1) for i in range(N_CELLS)]
+    )
+
+    umi_counts = sim_data.sim_umi_counts(N_CELLS, 2000, 200)
+    umi_counts = pd.Series(umi_counts)
+
+    gene_exp = np.random.rand(N_GENES, N_CELLS)
+    gene_exp = pd.DataFrame(
+        gene_exp,
+        index=['Gene{}'.format(i+1) for i in range(gene_exp.shape[0])],
+        columns=latent.index
+    )
+
+    for model in ['danb', 'bernoulli', 'normal', 'none']:
+        hs = Hotspot(
+            gene_exp, model=model, latent=latent, umi_counts=umi_counts
+        )
+        hs.create_knn_graph(False, n_neighbors=30)
+        hs.compute_hotspot()
+
+        assert isinstance(hs.results, pd.DataFrame)
+        assert hs.results.shape[0] == N_GENES
+
+        hs.compute_autocorrelations()
+
+        assert isinstance(hs.results, pd.DataFrame)
+        assert hs.results.shape[0] == N_GENES
+
+        hs.compute_local_correlations(gene_exp.index)
+
+        assert isinstance(hs.local_correlation_z, pd.DataFrame)
+        assert hs.local_correlation_z.shape[0] == N_GENES
+        assert hs.local_correlation_z.shape[1] == N_GENES
+
+        hs.create_modules(min_gene_threshold=2, fdr_threshold=1)
+
+        assert isinstance(hs.modules, pd.Series)
+        assert hs.modules.index.intersection(gene_exp.index).size == N_GENES
+
+        assert isinstance(hs.linkage, np.ndarray)
+        assert hs.linkage.shape == (N_GENES-1, 4)
+
+        hs.calculate_module_scores()
+
+        assert isinstance(hs.module_scores, pd.DataFrame)
+        assert (hs.module_scores.index == gene_exp.columns).all()
+
+
+def test_filter_genes():
+    """
+    Ensure genes with no expression are pre-filtered
+    """
+    # Simulate some data
+    N_CELLS = 100
+    N_DIM = 10
+    N_GENES = 10
+    N_GENES_ZERO = 5
+
+    latent = sim_data.sim_latent(N_CELLS, N_DIM)
+    latent = pd.DataFrame(latent)
+
+    umi_counts = sim_data.sim_umi_counts(N_CELLS, 2000, 200)
+    umi_counts = pd.Series(umi_counts)
+
+    gene_exp = np.random.rand(N_GENES+N_GENES_ZERO, N_CELLS)
+    gene_exp[N_GENES:] = 0
+    gene_exp = pd.DataFrame(
+        gene_exp,
+        index=['Gene{}'.format(i+1) for i in range(gene_exp.shape[0])],
+        columns=latent.index
+    )
+
+    hs = Hotspot(
+        gene_exp, model='normal', latent=latent, umi_counts=umi_counts
+    )
+
+    assert hs.counts.shape[0] == N_GENES