add J as optional argument

shahcompbio · Dec 4, 2023 · dec6ffb · dec6ffb
1 parent bde509b
commit dec6ffb
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 5 deletions.
diff --git a/scdna_replication_tools/infer_scRT.py b/scdna_replication_tools/infer_scRT.py
@@ -28,7 +28,7 @@ def __init__(self, cn_s, cn_g1, input_col='reads', assign_col='copy', library_co
                  rv_col='rt_value', rs_col='rt_state', frac_rt_col='frac_rt', clone_col='clone_id', rt_prior_col='mcf7rt',
                  cn_prior_method='hmmcopy', col2='rpm_gc_norm', col3='temp_rt', col4='changepoint_segments', col5='binary_thresh',
                  max_iter=2000, min_iter=100, max_iter_step1=None, min_iter_step1=None, max_iter_step3=None, min_iter_step3=None,
-                 cn_prior_weight=1e6, learning_rate=0.05, rel_tol=1e-6, cuda=False, seed=0, P=13, K=4, upsilon=6, run_step3=True):
+                 cn_prior_weight=1e6, learning_rate=0.05, rel_tol=1e-6, cuda=False, seed=0, P=13, K=4, J=5, upsilon=6, run_step3=True):
         self.cn_s = cn_s
         self.cn_g1 = cn_g1
 
@@ -82,6 +82,7 @@ def __init__(self, cn_s, cn_g1, input_col='reads', assign_col='copy', library_co
         self.seed = seed
         self.P = P  # number of CN states
         self.K = K  # polynomial degree
+        self.J = J  # number of G1-phase cells to include in the composite CN prior
         self.upsilon = upsilon
         self.run_step3 = run_step3
 
@@ -157,7 +158,7 @@ def infer_pert_model(self):
                                      rs_col=self.rs_col, frac_rt_col=self.frac_rt_col, cn_prior_method=self.cn_prior_method, cn_prior_weight=self.cn_prior_weight,
                                      learning_rate=self.learning_rate, max_iter=self.max_iter, min_iter=self.min_iter, rel_tol=self.rel_tol,
                                      min_iter_step1=self.min_iter_step1, min_iter_step3=self.min_iter_step3, max_iter_step1=self.max_iter_step1, max_iter_step3=self.max_iter_step3,
-                                     cuda=self.cuda, seed=self.seed, P=self.P, K=self.K, upsilon=self.upsilon, run_step3=self.run_step3)
+                                     cuda=self.cuda, seed=self.seed, P=self.P, K=self.K, J=self.J, upsilon=self.upsilon, run_step3=self.run_step3)
 
         print('running PERT model')
         print('cn_s.dtypes', self.cn_s.dtypes, sep='\n')

diff --git a/scdna_replication_tools/pert_model.py b/scdna_replication_tools/pert_model.py
@@ -40,7 +40,7 @@ def __init__(self, cn_s, cn_g1, input_col='reads', gc_col='gc', rt_prior_col='mc
                  rs_col='rt_state', frac_rt_col='frac_rt', cn_prior_method='g1_composite',
                  cn_prior_weight=1e6, learning_rate=0.05, max_iter=2000, min_iter=100, rel_tol=1e-6,
                  max_iter_step1=None, min_iter_step1=None, max_iter_step3=None, min_iter_step3=None,
-                 cuda=False, seed=0, P=13, K=4, upsilon=6, run_step3=True):
+                 cuda=False, seed=0, P=13, K=4, J=5, upsilon=6, run_step3=True):
         '''
         initialise the pert_infer_scRT object
         :param cn_s: long-form dataframe containing copy number and read count information from S-phase cells. (pandas.DataFrame)
@@ -72,6 +72,7 @@ def __init__(self, cn_s, cn_g1, input_col='reads', gc_col='gc', rt_prior_col='mc
         :param P: number of integer copy number states to include in the model, values range from 0 to P-1. (int)
         :param L: number of libraries. (int)
         :param K: maximum polynomial degree allowed for the GC bias curve. (int)
+        :param J: number of G1-phase cells to use for each S-phase cell when building the cn prior concentration matrix. (int)
         :param upsilon: value that alpha and beta terms should sum to when creating a beta distribution prior for tau. (int)
         :param run_step3: Should the pretrained S-phase model be run on low variance cells to look for misclassifications. (bool)
         '''
@@ -123,6 +124,7 @@ def __init__(self, cn_s, cn_g1, input_col='reads', gc_col='gc', rt_prior_col='mc
         self.P = P
         self.L = None
         self.K = K
+        self.J = J
 
         self.upsilon = upsilon
         self.run_step3 = run_step3
@@ -294,14 +296,19 @@ def build_clone_cn_prior(self, cn, cn_df, cn_tensor, clone_cn_profiles):
         return etas
 
 
-    def build_composite_cn_prior(self, cn, clone_cn_profiles, weight=1e5, J=5):
+    def build_composite_cn_prior(self, cn, clone_cn_profiles, weight=1e5):
         """ 
         Build a cn prior concentration matrix that uses both the consensus G1 clone profile and the closest G1-phase cell profiles.
         J represents the number of matching G1-phase cells to use for each S-phase cell.
         """
-
+        J = self.J
         temp_cn_g1 = self.cn_g1.copy()
 
+        # reduce J if it is larger than the number of G1 cells in the smallest clone
+        smallest_clone_size = temp_cn_g1[[self.cell_col, self.clone_col]].drop_duplicates().groupby(self.clone_col).size().min()
+        if smallest_clone_size < J:
+            J = smallest_clone_size
+
         # remove G1 cells from certain clones that don't belong to the majority ploidy
         # i.e. remove tetraploid cells if clone is 90% diploid
         ploidy_col = 'ploidy'