Merge pull request #11 from shahcompbio/dev
Dev
adamcweiner authored Nov 6, 2023
2 parents 7b834f5 + e1ba28f commit bde509b
Showing 11 changed files with 686 additions and 328 deletions.
5 changes: 3 additions & 2 deletions Dockerfile
@@ -24,11 +24,12 @@ COPY . /app
# RUN /bin/bash -c "source venv/bin/activate"

# Install dependencies
RUN pip install --upgrade pip setuptools
RUN pip install numpy==1.21.4 cython==0.29.22
RUN pip install --no-cache-dir -r requirements4.txt

# Install the package in development mode
RUN python setup.py develop
RUN pip install .

# Run unit tests
RUN pip install pytest pytest-cov
@@ -38,4 +39,4 @@ RUN pytest test_with_pytest.py --doctest-modules
# EXPOSE <port_number>

# Run any required commands or scripts to start the application
# CMD <command>
# CMD <command>
10 changes: 9 additions & 1 deletion README.md
@@ -5,6 +5,12 @@ Method for probabilistic estimation of replication timing (PERT) from single-cel

It is recommended that you use the docker image to run PERT. To do so, use [docker](https://docs.docker.com/get-docker/) or [singularity](https://docs.sylabs.io/guides/2.6/user-guide/singularity_and_docker.html) to pull the following [docker image](https://hub.docker.com/r/adamcweiner/scdna_replication_tools). This docker image contains all the necessary dependencies to run PERT and is automatically updated with the latest version of `main` using GitHub Actions.

When pulling the docker image, be sure to specify the `main` tag; omitting it will produce an error.

```
docker pull adamcweiner/scdna_replication_tools:main
```
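
If you are using singularity instead, an equivalent pull command (a sketch, assuming a singularity/apptainer release that supports the `docker://` URI) would be:

```
singularity pull docker://adamcweiner/scdna_replication_tools:main
```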

If you do not wish to use the docker container, you can set up a conda environment with the correct python version and use pip to install all the requirements in a virtual environment:

```
@@ -49,7 +55,9 @@ The main output when running PERT for scRT inference is a pandas dataframe with
`model_rep_state`: the estimated replication state for each bin. This is a binary variable, with 0 indicating the bin is unreplicated and 1 indicating the bin is replicated.
`model_cn_state`: the estimated somatic copy number for each bin. These will be integer values ranging from 0 to 11 (same domain as input `state`).

While there are other columns in the output dataframe, these are the most important for downstream analysis. These columns are used for downstream computation of pseudobulk replication timing profiles, each cell's fraction of replicated bins, cell cycle phase predictions, and a sample's time from scheduled replication (T-width). Other output columns from pert_model.py correspond to the name of different latent variables in the graphical model (see paper for details).
While there are other columns in the output dataframe, these are the most important for downstream analysis. Other output columns from pert_model.py correspond to the names of different latent variables in the graphical model (see paper for details). This output dataframe must then be passed into [`predict_cell_cycle_phase()`](https://github.com/shahcompbio/scdna_replication_tools/blob/main/scdna_replication_tools/predict_cycle_phase.py) to predict the phase of each cell.

We caution against directly using `model_rho` and `model_tau` to analyze a locus's replication timing or a cell's S-phase time, as only their values relative to one another within each PERT run are meaningful. For instance, for the same sample one run might produce rho and tau values that all lie between 0.1-0.2 and another run values between 0.4-0.9, but as long as the relative ordering of the rho and tau values is the same, all the replication (`model_rep_state`) and somatic copy number (`model_cn_state`) states should be the same. Because of this, users wishing to compute cell times within S-phase should access the `cell_frac_rep` column that gets produced after [predicting the revised cell cycle phases](https://github.com/shahcompbio/scdna_replication_tools/blob/main/scdna_replication_tools/predict_cycle_phase.py). Similarly, users wishing to compute replication timing profiles should pass the predicted S-phase cells into the [`compute_pseudobulk_rt_profiles()` function](https://github.com/shahcompbio/scdna_replication_tools/blob/main/scdna_replication_tools/compute_pseudobulk_rt_profiles.py).
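
As an illustration, here is a minimal sketch of this downstream flow. The `compute_pseudobulk_rt_profiles()` call follows the signature used in this repo, but the exact signature and return value of `predict_cell_cycle_phase()`, the input file path, and the `PERT_phase` column name are assumptions for illustration only:

```
import pandas as pd
from scdna_replication_tools.predict_cycle_phase import predict_cell_cycle_phase
from scdna_replication_tools.compute_pseudobulk_rt_profiles import compute_pseudobulk_rt_profiles

# PERT output dataframe containing model_rep_state and model_cn_state columns
cn_out = pd.read_csv('pert_output.tsv', sep='\t')

# Predict revised cell cycle phases; assumed to add a per-cell phase label
# and the cell_frac_rep column used for cell times within S-phase
cn_phased = predict_cell_cycle_phase(cn_out)

# Keep predicted S-phase cells and compute pseudobulk replication timing profiles
cn_s = cn_phased[cn_phased['PERT_phase'] == 'S']
bulk_rt = compute_pseudobulk_rt_profiles(
    cn_s, input_col='model_rep_state',
    output_col='pseudobulk', time_col='hours', clone_col='clone_id'
)
```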


## Feedback
116 changes: 58 additions & 58 deletions notebooks/inference_tutorial.ipynb

Large diffs are not rendered by default.

249 changes: 144 additions & 105 deletions notebooks/inference_tutorial_pt2.ipynb

Large diffs are not rendered by default.

69 changes: 34 additions & 35 deletions notebooks/simulator_tutorial.ipynb

Large diffs are not rendered by default.

17 changes: 9 additions & 8 deletions requirements4.txt
@@ -59,11 +59,11 @@ funcsigs==1.0.2
funsor==0.4.3
graphviz==0.20
h5py==3.6.0
hdbscan==0.8.27
hdbscan==0.8.33
hmmlearn==0.2.5
idna==2.8
imagesize==1.2.0
importlib-metadata==3.7.3
importlib-metadata==4.4
iniconfig==1.1.1
ipaddress==1.0.22
ipykernel==5.5.0
@@ -73,9 +73,10 @@ ipywidgets==7.6.3
isodate==0.6.0
isort==4.3.17
itypes==1.1.0
jax==0.3.13
# jax==0.3.25
# jaxlib==0.3.10
jaxlib==0.3.5
# jaxlib==0.3.5
# jaxlib==0.3.25
jedi==0.15.1
Jinja2==2.10.3
jira==2.0.0
@@ -117,7 +118,7 @@ notebook==6.2.0
numba==0.53.0
numexpr==2.7.3
numpy==1.21.4
numpyro==0.10.0
# numpyro==0.10.0
oauthlib==3.1.0
openapi-codec==1.3.2
opt-einsum==3.3.0
@@ -140,7 +141,7 @@ prompt-toolkit==2.0.10
ptyprocess==0.6.0
py==1.10.0
pyasn1==0.4.5
pyBigWig==0.3.18
pyBigWig==0.3.22
pycparser==2.19
pyfaidx==0.6.4
Pygments==2.4.2
@@ -183,8 +184,8 @@ singledispatch==3.4.0.3
-e git+https://github.com/shahcompbio/sisyphus.git@78c708b1dcf93f85e66b5faaafff581e56880c88#egg=sisyphus
six==1.13.0
snowballstemmer==2.1.0
sorted-nearest==0.0.33
Sphinx==1.8.5
sorted-nearest==0.0.39
Sphinx==5.1.1
sphinxcontrib-serializinghtml==1.1.5
sphinxcontrib-websupport==1.2.4
statannot==0.2.3
8 changes: 4 additions & 4 deletions scdna_replication_tools/compute_pseudobulk_rt_profiles.py
@@ -38,9 +38,9 @@ def calc_population_rt(cn, input_col, output_col, time_col='rt_hours', chr_col='
return pop_cn


def compute_pseudobulk_rt_profiles(cn, input_col, output_col='pseduobulk', time_col='hours', clone_col='clone_id', chr_col='chr', start_col='start'):
def compute_pseudobulk_rt_profiles(cn, input_col, output_col='pseudobulk', time_col='hours', clone_col='clone_id', chr_col='chr', start_col='start'):
'''
Compute population- and clone-level pseduobulk replication timing profiles
Compute population- and clone-level pseudobulk replication timing profiles
Args:
cn: dataframe where rows represent unique segments from all cells, columns contain genomic loci and input_col
@@ -56,7 +56,7 @@ def compute_pseudobulk_rt_profiles(cn, input_col, output_col='pseduobulk', time_
bulk_cn = calc_population_rt(cn, input_col=input_col, output_col=temp_output_col, time_col=temp_time_col, chr_col='chr', start_col='start')

if clone_col is not None:
for clone_id, clone_cn in cn.groupby('clone_id'):
for clone_id, clone_cn in cn.groupby(clone_col):
# compute pseudobulk for this clone
temp_output_col = '{}_clone{}_{}'.format(output_col, clone_id, input_col)
temp_time_col = '{}_clone{}_{}'.format(output_col, clone_id, time_col)
@@ -74,7 +74,7 @@ def main():

cn = pd.read_csv(argv.input, sep='\t')

bulk_cn = compute_pseudobulk_rt_profiles(cn, argv.column, output_col='pseduobulk', time_col='hours', clone_col='clone_id')
bulk_cn = compute_pseudobulk_rt_profiles(cn, argv.column, output_col='pseudobulk', time_col='hours', clone_col='clone_id')

bulk_cn.to_csv(argv.output, sep='\t', index=False)

5 changes: 3 additions & 2 deletions scdna_replication_tools/infer_scRT.py
@@ -28,7 +28,7 @@ def __init__(self, cn_s, cn_g1, input_col='reads', assign_col='copy', library_co
rv_col='rt_value', rs_col='rt_state', frac_rt_col='frac_rt', clone_col='clone_id', rt_prior_col='mcf7rt',
cn_prior_method='hmmcopy', col2='rpm_gc_norm', col3='temp_rt', col4='changepoint_segments', col5='binary_thresh',
max_iter=2000, min_iter=100, max_iter_step1=None, min_iter_step1=None, max_iter_step3=None, min_iter_step3=None,
learning_rate=0.05, rel_tol=1e-6, cuda=False, seed=0, P=13, K=4, upsilon=6, run_step3=True):
cn_prior_weight=1e6, learning_rate=0.05, rel_tol=1e-6, cuda=False, seed=0, P=13, K=4, upsilon=6, run_step3=True):
self.cn_s = cn_s
self.cn_g1 = cn_g1

@@ -73,6 +73,7 @@ def __init__(self, cn_s, cn_g1, input_col='reads', assign_col='copy', library_co

# class objects that are specific to the pyro model
self.cn_prior_method = cn_prior_method
self.cn_prior_weight = cn_prior_weight
self.learning_rate = learning_rate
self.max_iter = max_iter
self.min_iter = min_iter
@@ -153,7 +154,7 @@ def infer_pert_model(self):
pert_model = pert_infer_scRT(self.cn_s, self.cn_g1, input_col=self.input_col, gc_col=self.gc_col, rt_prior_col=self.rt_prior_col,
clone_col=self.clone_col, cell_col=self.cell_col, library_col=self.library_col, assign_col=self.assign_col,
chr_col=self.chr_col, start_col=self.start_col, cn_state_col=self.cn_state_col,
rs_col=self.rs_col, frac_rt_col=self.frac_rt_col, cn_prior_method=self.cn_prior_method,
rs_col=self.rs_col, frac_rt_col=self.frac_rt_col, cn_prior_method=self.cn_prior_method, cn_prior_weight=self.cn_prior_weight,
learning_rate=self.learning_rate, max_iter=self.max_iter, min_iter=self.min_iter, rel_tol=self.rel_tol,
min_iter_step1=self.min_iter_step1, min_iter_step3=self.min_iter_step3, max_iter_step1=self.max_iter_step1, max_iter_step3=self.max_iter_step3,
cuda=self.cuda, seed=self.seed, P=self.P, K=self.K, upsilon=self.upsilon, run_step3=self.run_step3)
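
For context, a hedged sketch of how the new `cn_prior_weight` argument might be supplied when constructing the inference object and running PERT. The class name `scRT`, the input file paths, and the return value of `infer_pert_model()` are assumptions for illustration, not a verified API:

```
import pandas as pd
from scdna_replication_tools.infer_scRT import scRT  # class name assumed

# Hypothetical inputs: binned read counts for S-phase and G1/2-phase cells
cn_s = pd.read_csv('s_phase_cells.tsv', sep='\t')
cn_g1 = pd.read_csv('g1_phase_cells.tsv', sep='\t')

# cn_prior_weight (new in this diff, default 1e6) presumably controls how strongly
# the somatic copy number prior is weighted during PERT inference
scrt = scRT(cn_s, cn_g1, input_col='reads', assign_col='copy',
            clone_col='clone_id', cn_prior_method='hmmcopy',
            cn_prior_weight=1e6)

results = scrt.infer_pert_model()  # return value assumed; see infer_scRT.py
```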
