Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mason sim #2

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions src/run_mason.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/usr/bin/env python3
"""
Simulates short-read Illumina data from amplicon sequences using Mason.
"""

import argparse
from subprocess import call
import os
import sys

from utilities import compute_number_of_reads_to_simulate, get_fasta_length


def check_mason_path(mason_path=None):
if not os.path.isfile(mason_path):

raise Exception("Cannot find Mason. Specify an existing file path for "
" Mason.")

def run_mason(fasta, mason_path, output_prefix, read_length, coverage, circ_or_linear, nthreads, seed=None):
"""Simulates short-read data with Mason.

Args:
fasta: Path to FASTA file containing amplicon.
mason_path: Path to Mason executable.
output_prefix: Prefix for file path to write out simulated read data.
read_length: Read length for simulated Illumina data (e.g., 150).
coverage: Target coverage for amplicon.
circ_or_linear: Whether or not to simulate reads from a circular or
linear amplicon. (Note used currently.)
nthreads: Number of threads to utilize.
seed: Random seed to utilize.

Returns:
None. Call Mason to simulate data.
"""
fasta_length = get_fasta_length(fasta)
num_reads = str(compute_number_of_reads_to_simulate(coverage, fasta_length, read_length))

R1 = f"{output_prefix}_R1.fastq.gz"
R2 = f"{output_prefix}_R2.fastq.gz"
cmd = "{} -ir {} -n {} --illumina-read-length {} --seq-technology illumina --num-threads {} -o {} -or {}".format(
mason_path, fasta, num_reads, read_length, nthreads, R1, R2
)

if seed:
cmd+=(" --seed " + str(seed))

print(cmd)
call(cmd, shell=True)


if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Call Mason to simulate nanopore reads from amplicon genome "
"structures and background genome regions.")
parser.add_argument("--mason_path", help="Path to Mason executable.", required=True)
parser.add_argument("-o", "--output_prefix", help="Prefix for output filenames.", required=True)
parser.add_argument("--amplicon_fasta", help="Fasta file of simulated amplicon structure.", required=True)
parser.add_argument("--amplicon_coverage", type=float, help="Coverage for amplicon region - scale up to simulate "
"higher amplicon CNs", required=True)

parser.add_argument("-l", "--read_length", help="Read length", type=int, default=150)
parser.add_argument("--background_fasta", help="Fasta file of background genome regions. Do not provide if you want"
" reads from amplicon only")
parser.add_argument("--background_coverage", type=float, help="Coverage for background region - scale up to "
"simulate lower amplicon CNs")
parser.add_argument("-t", "--num_threads", type=int, help="Number of threads to use in all stages (default=1).",
default=1)
parser.add_argument("--seed", help="Manually seed the pseudo-random number generator for the simulation.")

# Note: this parameter is not used for now.
parser.add_argument("--amp_is_linear", action='store_true', help="Set if the simulated amplicon is linear instead"
" of circular.")
args = parser.parse_args()
if args.background_fasta and not args.background_coverage:
print("--background_coverage required if --background_fasta is provided!")
sys.exit(1)

circular_or_linear = "linear" if args.amp_is_linear else "circular"

# set the nanopore read simulation model
check_mason_path(args.mason_path)

# simulate reads from the amplicon
print("Simulating reads from amplicon...")
amplicon_prefix = args.output_prefix + "_amplicon_reads"
run_mason(
args.amplicon_fasta,
args.mason_path,
amplicon_prefix,
args.read_length,
args.amplicon_coverage,
circular_or_linear,
args.num_threads,
args.seed
)

# simulate reads from the background
if args.background_fasta:
print("\nSimulating reads from background...")
background_prefix = args.output_prefix + "_background_reads"
run_mason(
args.background_fasta,
args.mason_path,
amplicon_background_prefixrefix,
jluebeck marked this conversation as resolved.
Show resolved Hide resolved
args.read_length,
args.background_coverage,
"linear",
args.num_threads,
args.seed
)
27 changes: 5 additions & 22 deletions src/run_nanosim.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import os
import sys

from utilities import compute_number_of_reads_to_simulate, get_fasta_length

# this script will run the nanosim pipeline to simulate nanopore reads from ecDNA and background genome.
# it assumes nanosim is installed and its scripts are on the system path (true if installed by conda).

Expand All @@ -16,32 +18,13 @@ def extract_nanosim_model(model_path):
if not os.path.isdir(model_path):
print("Extracting model files...")
cmd = "tar -xzf {}".format(model_path + ".tar.gz")
print(cmd)
print(cmd)
call(cmd, shell=True)


def get_fasta_length(fasta):
tlen = 0
with open(fasta) as infile:
for line in infile:
if line.startswith(">"):
continue

tlen+=len(line.rstrip())

print("Fasta size is " + str(tlen) + "bp")
return tlen


def compute_number_of_reads_to_simulate(coverage, fasta_len, mean_rl):
nreads = int(ceil(coverage * fasta_len / mean_rl))
print("Number of reads to simulate for {}x coverage: {}".format(str(coverage), str(nreads)))
return nreads


def run_nanosim(fasta, model_pre, output_pre, coverage, circ_or_linear, read_length_tuple, nthreads, seed=None):
fasta_length = get_fasta_length(fasta)
num_reads = str(compute_number_of_reads_to_simulate(coverage, fasta_length, model_mean_read_length))
fasta_length = utilities.get_fasta_length(fasta)
num_reads = str(utilities.compute_number_of_reads_to_simulate(coverage, fasta_length, model_mean_read_length))

cmd = "simulator.py genome -rg {} -c {} -o {} -n {} -dna_type {} -t {} -b guppy --fastq".format(
fasta, model_pre, output_pre, num_reads, circ_or_linear, str(nthreads))
Expand Down
23 changes: 23 additions & 0 deletions src/utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
File for basic utilities.
"""
from math import ceil


def get_fasta_length(fasta):
tlen = 0
with open(fasta) as infile:
for line in infile:
if line.startswith(">"):
continue

tlen+=len(line.rstrip())

print("Fasta size is " + str(tlen) + "bp")
return tlen


def compute_number_of_reads_to_simulate(coverage, fasta_len, mean_rl):
nreads = int(ceil(coverage * fasta_len / mean_rl))
print("Number of reads to simulate for {}x coverage: {}".format(str(coverage), str(nreads)))
return nreads