Merge pull request #37 from cbg-ethz/revisions

Revisions of VILOCA manuscript
cbg-ethz · Sep 6, 2024 · 65e6058 · 65e6058
2 parents bb554a7 + 7551c1a
commit 65e6058
Show file tree

Hide file tree

Showing 10 changed files with 45 additions and 18 deletions.
diff --git a/README.md b/README.md
@@ -6,6 +6,8 @@ data. It is designed to analyse genetically heterogeneous samples. Its tools
 are written in different programming languages and provide error correction,
 haplotype reconstruction and estimation of the frequency of the different
 genetic variants present in a mixed sample.
+VILOCA takes an alignment file as input, and subsequently generates mutation calls and local haplotypes.
+
 
 The corresponding manuscript can be found here: https://www.biorxiv.org/content/10.1101/2024.06.06.597712v1
 
@@ -62,6 +64,13 @@ There are several parameters available:
 
 `--windowsize`: In case no insert file is provided, the genome is tiled into uniform local regions. `windowsize` determines the length of those local regions. It should be of roughly the length of the reads. This is also the length of the haplotypes that are produced.
 
+### Output
+`haplotypes` This directory contains the reconstructed local haplotypes as separate fasta files per local region.
+
+`coverage.txt` List of each local region with start and end positions, and number of reads considered in the region.
+
+`cooccurring_mutations.csv` All mutation calls including the information on which haplotype it occurred on.
+
 ## Development/CI with Docker
 The following command in the root directory will let you interact with the project locally through Docker.
 ```bash

diff --git a/tests/test_long_deletions.py b/tests/test_long_deletions.py
@@ -16,8 +16,8 @@ def test_long_deletions():
 
     # Input data
     bamfile = "test_aln.cram"
-    snvsfile = "snv/SNV.tsv"
-    outfile = "snv/SNVs_0.010000.tsv"
+    snvsfile = "work/snv/SNV.tsv"
+    outfile = "work/snv/SNVs_0.010000.tsv"
 
     helper_long_deletions.main(bamfile=bamfile, snvsfile=snvsfile, outfile=outfile)
 

diff --git a/tests/test_pooled_post.py b/tests/test_pooled_post.py
@@ -24,7 +24,7 @@ def open_side_effect(name):
 
 def test__ingest_sampler_results_gamma_theta():
     with patch("builtins.open", mock_open(read_data="bla\n#gamma = 0.967662\n\nlolz\n#theta = 0.88\neof")):
-        out = pooled_post._ingest_sampler_results_gamma_theta(open("debug/dbg.dbg"), "shorah")
+        out = pooled_post._ingest_sampler_results_gamma_theta(open("work/debug/dbg.dbg"), "shorah")
 
         np.testing.assert_almost_equal(out[0][0], -0.032872426234558716)
         np.testing.assert_almost_equal(out[0][1], -3.431512269664515)

diff --git a/tests/test_shotgun_e2e.py b/tests/test_shotgun_e2e.py
@@ -5,7 +5,7 @@
 import shutil
 
 cwd = "./data_1"
-f_path = "raw_reads/w-HXB2-2804-3004.extended-ref.fas"
+f_path = "work/raw_reads/w-HXB2-2804-3004.extended-ref.fas"
 base_files = []
 
 @pytest.fixture(scope="session", autouse=True)
@@ -35,7 +35,7 @@ def test_e2e_shorah():
 
     assert filecmp.cmp(
         "./data_1/test.csv",
-        "./data_1/snv/SNVs_0.010000_final.csv",
+        "./data_1/work/snv/SNVs_0.010000_final.csv",
         shallow=False
     )
 

diff --git a/viloca/b2w.py b/viloca/b2w.py
@@ -555,6 +555,8 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
         reference_filename=reference_filename,
         threads=max_proc #1
     )
+    # print this message since pysam print confusing error message with the functions above.
+    print("Index file was created successfully.")
     #reffile = pysam.FastaFile(reference_filename) --> we need to read it in each child processo
     #counter = 0 #--> counter is now coputed initially for all windows
     reference_name = tiling_strategy.get_reference_name()

diff --git a/viloca/cli.py b/viloca/cli.py
@@ -178,7 +178,7 @@ def main():
 
     parser_shotgun.add_argument("--n_max_haplotypes", metavar='INT', type=int,
                                 required=False, default=100, dest="n_max_haplotypes",
-                                help="Guess of maximal guess of haplotypes.")
+                                help="Guess of maximal guess of haplotypes. If VILOCA returns the maximal number of haplotypes then this number was choosen to little and needs to be increased.")
 
     parser_shotgun.add_argument("--conv_thres", metavar='FLOAT', type=float,
                             required=False, default=1e-03, dest="conv_thres",

diff --git a/viloca/local_haplotype_inference/learn_error_params/run_dpm_mfa.py b/viloca/local_haplotype_inference/learn_error_params/run_dpm_mfa.py
@@ -11,7 +11,7 @@
 from . import cavi
 
 logging.basicConfig(
-    filename="viloca_inference.log", encoding="utf-8", level=logging.INFO
+    filename="viloca.log", encoding="utf-8", level=logging.INFO
 )
 
 

diff --git a/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py b/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py
@@ -13,7 +13,7 @@
 from . import cavi
 
 logging.basicConfig(
-    filename="viloca_inference.log", encoding="utf-8", level=logging.INFO
+    filename="viloca.log", encoding="utf-8", level=logging.INFO
 )
 
 

diff --git a/viloca/shorah_snv.py b/viloca/shorah_snv.py
@@ -203,7 +203,7 @@ def parseWindow(line, extended_window_mode, exclude_non_var_pos_threshold,
     _, chrom, beg, end, _ = line.rstrip().split("\t")
 
     file_stem = "w-%s-%s-%s" % (chrom, beg, end)
-    haplo_filename = os.path.join(working_dir, "support", file_stem + ".reads-support.fas")
+    haplo_filename = os.path.join(working_dir, "haplotypes", file_stem + ".reads-support.fas")
 
     if extended_window_mode:
         ref_name = "extended-ref"

diff --git a/viloca/shotgun.py b/viloca/shotgun.py
@@ -523,7 +523,7 @@ def main(args):
     logging.debug("[shotgun] All processes completed successfully.")
 
     # prepare directories
-    for sd_name in ['debug', 'sampling', 'freq', 'support',
+    for sd_name in ['debug', 'haplotypes', 'freq', 'sampling', 'work',
                     'corrected', 'raw_reads', 'inference']:
         try:
             os.mkdir(sd_name)
@@ -557,7 +557,7 @@ def main(args):
     move_files_into_dir("debug", glob.glob("./w*dbg"))
     move_files_into_dir("sampling", glob.glob("./w*smp"))
     move_files_into_dir("corrected", glob.glob("./w*reads-cor.fas"))
-    move_files_into_dir("support", glob.glob("./w*reads-support.fas"))
+    move_files_into_dir("haplotypes", glob.glob("./w*reads-support.fas"))
     move_files_into_dir("freq", glob.glob("./w*reads-freq.csv"))
     move_files_into_dir("sampling", glob.glob("./w*smp"))
     raw_reads_files = glob.glob('./w*reads.fas') + glob.glob('./w*ref.fas') + glob.glob('./w*qualities.npy')
@@ -628,15 +628,15 @@ def main(args):
                 envp_post.post_process_for_envp(
                     open(f"raw_reads/{stem}.envp-full-ref.fas"),
                     open(f"raw_reads/{stem}.envp-ref.fas"),
-                    f"support/{stem}.reads-support.fas",
-                    f"support/{stem}.reads-support.fas" # overwrite
+                    f"haplotypes/{stem}.reads-support.fas",
+                    f"haplotypes/{stem}.reads-support.fas" # overwrite
                 )
 
     # Pooled
     b_list = args.b.copy()
     if len(b_list) > 1:
         for idx, i in enumerate(b_list):
-            Path(f"sample{idx}/support").mkdir(parents=True, exist_ok=True)
+            Path(f"sample{idx}/haplotypes").mkdir(parents=True, exist_ok=True)
             Path(f"sample{idx}/corrected").mkdir(parents=True, exist_ok=True)
             with open("coverage.txt") as cov:
                 for line in cov:
@@ -654,16 +654,16 @@ def main(args):
                         f"raw_reads/{stem}.ref.fas",
                         filtered_reads.name,
                         open(f"debug/{stem}.dbg") if inference_type == "shorah" else open(f"inference/{stem}.reads-all_results.pkl", "rb"),
-                        open(f"support/{stem}.reads-support.fas"),
+                        open(f"haplotypes/{stem}.reads-support.fas"),
                         filtered_cor_reads_path,
                         inference_type,
                         None if inference_type != "use_quality_scores" else f"raw_reads/{stem}.qualities.npy" # TODO untested
                     )
                     filtered_reads.close()
 
                     pooled_post.write_support_file_per_sample(
-                        open(f"support/{stem}.reads-support.fas"),
-                        open(f"sample{idx}/support/{stem}.reads-support.fas", "w+"), # TODO
+                        open(f"haplotypes/{stem}.reads-support.fas"),
+                        open(f"sample{idx}/haplotypes/{stem}.reads-support.fas", "w+"), # TODO
                         *posterior_and_avg
                     )
 
@@ -682,7 +682,23 @@ def main(args):
             os.rename('snv', 'snv_before_%d' % int(time.time()))
             os.mkdir('snv')
 
-        for snv_file in glob.glob('./raw_snv*') + glob.glob('./SNV*')+ glob.glob('./cooccurring_mutations.csv'):
+        for snv_file in glob.glob('./SNV*_final*')+ glob.glob('./cooccurring_mutations.csv'):
+            shutil.copy(snv_file, 'snv/')
+        for snv_file in glob.glob('./raw_snv*') + glob.glob('./SNV*.tsv'):
             shutil.move(snv_file, 'snv/')
 
+    # now move all files that are not directly results into the debug directory
+    shutil.move("inference", "work")
+    shutil.move("raw_reads", "work")
+    shutil.move("sampling", "work")
+    shutil.move("debug", "work")
+    shutil.move("freq", "work")
+    shutil.move("corrected", "work")
+    shutil.move("reads.fas", "work")
+    shutil.move("proposed.dat", "work")
+    shutil.move("snv", "work")
+    shutil.move(glob.glob('*.cor.fas')[0], "work")
+
     logging.info('shotgun run ends')
+    logging.info('VILOCA terminated')
+    print("VILOCA terminated successfully.")