Skip to content

Commit

Permalink
Merge pull request #37 from cbg-ethz/revisions
Browse files Browse the repository at this point in the history
Revisions of VILOCA manuscript
  • Loading branch information
LaraFuhrmann authored Sep 6, 2024
2 parents bb554a7 + 7551c1a commit 65e6058
Show file tree
Hide file tree
Showing 10 changed files with 45 additions and 18 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ data. It is designed to analyse genetically heterogeneous samples. Its tools
are written in different programming languages and provide error correction,
haplotype reconstruction and estimation of the frequency of the different
genetic variants present in a mixed sample.
VILOCA takes an alignment file as input, and subsequently generates mutation calls and local haplotypes.


The corresponding manuscript can be found here: https://www.biorxiv.org/content/10.1101/2024.06.06.597712v1

Expand Down Expand Up @@ -62,6 +64,13 @@ There are several parameters available:

`--windowsize`: In case no insert file is provided, the genome is tiled into uniform local regions. `windowsize` determines the length of those local regions. It should be of roughly the length of the reads. This is also the length of the haplotypes that are produced.

### Output
`haplotypes` This directory contains the reconstructed local haplotypes as separate fasta files per local region.

`coverage.txt` List of each local region with start and end positions, and number of reads considered in the region.

`cooccurring_mutations.csv` All mutation calls including the information on which haplotype it occurred on.

## Development/CI with Docker
The following command in the root directory will let you interact with the project locally through Docker.
```bash
Expand Down
4 changes: 2 additions & 2 deletions tests/test_long_deletions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def test_long_deletions():

# Input data
bamfile = "test_aln.cram"
snvsfile = "snv/SNV.tsv"
outfile = "snv/SNVs_0.010000.tsv"
snvsfile = "work/snv/SNV.tsv"
outfile = "work/snv/SNVs_0.010000.tsv"

helper_long_deletions.main(bamfile=bamfile, snvsfile=snvsfile, outfile=outfile)

Expand Down
2 changes: 1 addition & 1 deletion tests/test_pooled_post.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def open_side_effect(name):

def test__ingest_sampler_results_gamma_theta():
with patch("builtins.open", mock_open(read_data="bla\n#gamma = 0.967662\n\nlolz\n#theta = 0.88\neof")):
out = pooled_post._ingest_sampler_results_gamma_theta(open("debug/dbg.dbg"), "shorah")
out = pooled_post._ingest_sampler_results_gamma_theta(open("work/debug/dbg.dbg"), "shorah")

np.testing.assert_almost_equal(out[0][0], -0.032872426234558716)
np.testing.assert_almost_equal(out[0][1], -3.431512269664515)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_shotgun_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import shutil

cwd = "./data_1"
f_path = "raw_reads/w-HXB2-2804-3004.extended-ref.fas"
f_path = "work/raw_reads/w-HXB2-2804-3004.extended-ref.fas"
base_files = []

@pytest.fixture(scope="session", autouse=True)
Expand Down Expand Up @@ -35,7 +35,7 @@ def test_e2e_shorah():

assert filecmp.cmp(
"./data_1/test.csv",
"./data_1/snv/SNVs_0.010000_final.csv",
"./data_1/work/snv/SNVs_0.010000_final.csv",
shallow=False
)

Expand Down
2 changes: 2 additions & 0 deletions viloca/b2w.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,8 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
reference_filename=reference_filename,
threads=max_proc #1
)
# print this message since pysam print confusing error message with the functions above.
print("Index file was created successfully.")
#reffile = pysam.FastaFile(reference_filename) --> we need to read it in each child processo
#counter = 0 #--> counter is now coputed initially for all windows
reference_name = tiling_strategy.get_reference_name()
Expand Down
2 changes: 1 addition & 1 deletion viloca/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def main():

parser_shotgun.add_argument("--n_max_haplotypes", metavar='INT', type=int,
required=False, default=100, dest="n_max_haplotypes",
help="Guess of maximal guess of haplotypes.")
help="Guess of maximal guess of haplotypes. If VILOCA returns the maximal number of haplotypes then this number was choosen to little and needs to be increased.")

parser_shotgun.add_argument("--conv_thres", metavar='FLOAT', type=float,
required=False, default=1e-03, dest="conv_thres",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from . import cavi

logging.basicConfig(
filename="viloca_inference.log", encoding="utf-8", level=logging.INFO
filename="viloca.log", encoding="utf-8", level=logging.INFO
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from . import cavi

logging.basicConfig(
filename="viloca_inference.log", encoding="utf-8", level=logging.INFO
filename="viloca.log", encoding="utf-8", level=logging.INFO
)


Expand Down
2 changes: 1 addition & 1 deletion viloca/shorah_snv.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def parseWindow(line, extended_window_mode, exclude_non_var_pos_threshold,
_, chrom, beg, end, _ = line.rstrip().split("\t")

file_stem = "w-%s-%s-%s" % (chrom, beg, end)
haplo_filename = os.path.join(working_dir, "support", file_stem + ".reads-support.fas")
haplo_filename = os.path.join(working_dir, "haplotypes", file_stem + ".reads-support.fas")

if extended_window_mode:
ref_name = "extended-ref"
Expand Down
34 changes: 25 additions & 9 deletions viloca/shotgun.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,7 @@ def main(args):
logging.debug("[shotgun] All processes completed successfully.")

# prepare directories
for sd_name in ['debug', 'sampling', 'freq', 'support',
for sd_name in ['debug', 'haplotypes', 'freq', 'sampling', 'work',
'corrected', 'raw_reads', 'inference']:
try:
os.mkdir(sd_name)
Expand Down Expand Up @@ -557,7 +557,7 @@ def main(args):
move_files_into_dir("debug", glob.glob("./w*dbg"))
move_files_into_dir("sampling", glob.glob("./w*smp"))
move_files_into_dir("corrected", glob.glob("./w*reads-cor.fas"))
move_files_into_dir("support", glob.glob("./w*reads-support.fas"))
move_files_into_dir("haplotypes", glob.glob("./w*reads-support.fas"))
move_files_into_dir("freq", glob.glob("./w*reads-freq.csv"))
move_files_into_dir("sampling", glob.glob("./w*smp"))
raw_reads_files = glob.glob('./w*reads.fas') + glob.glob('./w*ref.fas') + glob.glob('./w*qualities.npy')
Expand Down Expand Up @@ -628,15 +628,15 @@ def main(args):
envp_post.post_process_for_envp(
open(f"raw_reads/{stem}.envp-full-ref.fas"),
open(f"raw_reads/{stem}.envp-ref.fas"),
f"support/{stem}.reads-support.fas",
f"support/{stem}.reads-support.fas" # overwrite
f"haplotypes/{stem}.reads-support.fas",
f"haplotypes/{stem}.reads-support.fas" # overwrite
)

# Pooled
b_list = args.b.copy()
if len(b_list) > 1:
for idx, i in enumerate(b_list):
Path(f"sample{idx}/support").mkdir(parents=True, exist_ok=True)
Path(f"sample{idx}/haplotypes").mkdir(parents=True, exist_ok=True)
Path(f"sample{idx}/corrected").mkdir(parents=True, exist_ok=True)
with open("coverage.txt") as cov:
for line in cov:
Expand All @@ -654,16 +654,16 @@ def main(args):
f"raw_reads/{stem}.ref.fas",
filtered_reads.name,
open(f"debug/{stem}.dbg") if inference_type == "shorah" else open(f"inference/{stem}.reads-all_results.pkl", "rb"),
open(f"support/{stem}.reads-support.fas"),
open(f"haplotypes/{stem}.reads-support.fas"),
filtered_cor_reads_path,
inference_type,
None if inference_type != "use_quality_scores" else f"raw_reads/{stem}.qualities.npy" # TODO untested
)
filtered_reads.close()

pooled_post.write_support_file_per_sample(
open(f"support/{stem}.reads-support.fas"),
open(f"sample{idx}/support/{stem}.reads-support.fas", "w+"), # TODO
open(f"haplotypes/{stem}.reads-support.fas"),
open(f"sample{idx}/haplotypes/{stem}.reads-support.fas", "w+"), # TODO
*posterior_and_avg
)

Expand All @@ -682,7 +682,23 @@ def main(args):
os.rename('snv', 'snv_before_%d' % int(time.time()))
os.mkdir('snv')

for snv_file in glob.glob('./raw_snv*') + glob.glob('./SNV*')+ glob.glob('./cooccurring_mutations.csv'):
for snv_file in glob.glob('./SNV*_final*')+ glob.glob('./cooccurring_mutations.csv'):
shutil.copy(snv_file, 'snv/')
for snv_file in glob.glob('./raw_snv*') + glob.glob('./SNV*.tsv'):
shutil.move(snv_file, 'snv/')

# now move all files that are not directly results into the debug directory
shutil.move("inference", "work")
shutil.move("raw_reads", "work")
shutil.move("sampling", "work")
shutil.move("debug", "work")
shutil.move("freq", "work")
shutil.move("corrected", "work")
shutil.move("reads.fas", "work")
shutil.move("proposed.dat", "work")
shutil.move("snv", "work")
shutil.move(glob.glob('*.cor.fas')[0], "work")

logging.info('shotgun run ends')
logging.info('VILOCA terminated')
print("VILOCA terminated successfully.")

0 comments on commit 65e6058

Please sign in to comment.