From 51cfd301faaa2af7e6e2189f0695b2f44a8982ab Mon Sep 17 00:00:00 2001 From: rhysnewell Date: Wed, 24 Apr 2024 06:14:02 +0000 Subject: [PATCH] fix: batch checks read type correctly --- aviary/__init__.py | 24 +++++++++++++++++++ aviary/aviary.py | 35 ++++++++++----------------- aviary/modules/processor.py | 47 +++++++++++++++++++++---------------- 3 files changed, 63 insertions(+), 43 deletions(-) diff --git a/aviary/__init__.py b/aviary/__init__.py index 3e2f46a3..b201f874 100644 --- a/aviary/__init__.py +++ b/aviary/__init__.py @@ -1 +1,25 @@ __version__ = "0.9.0" + + +# CONSTANTS +LONG_READ_TYPES = ["ont", "ont_hq", "rs", "sq", "ccs", "hifi"] +MEDAKA_MODELS = [ + "r103_fast_g507", "r103_fast_snp_g507", "r103_fast_variant_g507", "r103_hac_g507", "r103_hac_snp_g507", + "r103_hac_variant_g507", "r103_min_high_g345", "r103_min_high_g360", "r103_prom_high_g360", "r103_prom_snp_g3210", + "r103_prom_variant_g3210", "r103_sup_g507", "r103_sup_snp_g507", "r103_sup_variant_g507", "r1041_e82_260bps_fast_g632", + "r1041_e82_260bps_fast_variant_g632", "r1041_e82_260bps_hac_g632", "r1041_e82_260bps_hac_variant_g632", "r1041_e82_260bps_sup_g632", + "r1041_e82_260bps_sup_variant_g632", "r1041_e82_400bps_fast_g615", "r1041_e82_400bps_fast_g632", + "r1041_e82_400bps_fast_variant_g615", "r1041_e82_400bps_fast_variant_g632", "r1041_e82_400bps_hac_g615", + "r1041_e82_400bps_hac_g632", "r1041_e82_400bps_hac_variant_g615", "r1041_e82_400bps_hac_variant_g632", "r1041_e82_400bps_sup_g615", + "r1041_e82_400bps_sup_variant_g615", "r104_e81_fast_g5015", "r104_e81_fast_variant_g5015", "r104_e81_hac_g5015", + "r104_e81_hac_variant_g5015", "r104_e81_sup_g5015", "r104_e81_sup_g610", "r104_e81_sup_variant_g610", "r10_min_high_g303", + "r10_min_high_g340", "r941_e81_fast_g514", "r941_e81_fast_variant_g514", "r941_e81_hac_g514", "r941_e81_hac_variant_g514", + "r941_e81_sup_g514", "r941_e81_sup_variant_g514", "r941_min_fast_g303", "r941_min_fast_g507", "r941_min_fast_snp_g507", + "r941_min_fast_variant_g507", "r941_min_hac_g507", "r941_min_hac_snp_g507", "r941_min_hac_variant_g507", "r941_min_high_g303", + "r941_min_high_g330", "r941_min_high_g340_rle", "r941_min_high_g344", "r941_min_high_g351", "r941_min_high_g360", "r941_min_sup_g507", + "r941_min_sup_snp_g507", "r941_min_sup_variant_g507", "r941_prom_fast_g303", "r941_prom_fast_g507", "r941_prom_fast_snp_g507", + "r941_prom_fast_variant_g507", "r941_prom_hac_g507", "r941_prom_hac_snp_g507", "r941_prom_hac_variant_g507", "r941_prom_high_g303", + "r941_prom_high_g330", "r941_prom_high_g344", "r941_prom_high_g360", "r941_prom_high_g4011", "r941_prom_snp_g303", "r941_prom_snp_g322", + "r941_prom_snp_g360", "r941_prom_sup_g507", "r941_prom_sup_snp_g507", "r941_prom_sup_variant_g507", "r941_prom_variant_g303", + "r941_prom_variant_g322", "r941_prom_variant_g360", "r941_sup_plant_g610", "r941_sup_plant_variant_g610" +] \ No newline at end of file diff --git a/aviary/aviary.py b/aviary/aviary.py index 0364ab62..f1b28676 100755 --- a/aviary/aviary.py +++ b/aviary/aviary.py @@ -19,7 +19,7 @@ ############################################################################### import aviary.config.config as Config from aviary.modules.processor import Processor, process_batch -from .__init__ import __version__ +from .__init__ import __version__, MEDAKA_MODELS, LONG_READ_TYPES __author__ = "Rhys Newell" __copyright__ = "Copyright 2022" __credits__ = ["Rhys Newell"] @@ -478,7 +478,7 @@ def main(): 'reads, "ont" for Oxford Nanopore and "ont_hq" for Oxford Nanopore high quality reads (Guppy5+ or Q20) \n', dest='longread_type', default="ont", - choices=["ont","ont_hq", "rs", "sq", "ccs", "hifi"], + choices=LONG_READ_TYPES, ) long_read_group.add_argument( @@ -486,26 +486,7 @@ def main(): help='Medaka model to use for polishing long reads. \n', dest='medaka_model', default="r941_min_hac_g507", - choices=[ - "r103_fast_g507", "r103_fast_snp_g507", "r103_fast_variant_g507", "r103_hac_g507", "r103_hac_snp_g507", - "r103_hac_variant_g507", "r103_min_high_g345", "r103_min_high_g360", "r103_prom_high_g360", "r103_prom_snp_g3210", - "r103_prom_variant_g3210", "r103_sup_g507", "r103_sup_snp_g507", "r103_sup_variant_g507", "r1041_e82_260bps_fast_g632", - "r1041_e82_260bps_fast_variant_g632", "r1041_e82_260bps_hac_g632", "r1041_e82_260bps_hac_variant_g632", "r1041_e82_260bps_sup_g632", - "r1041_e82_260bps_sup_variant_g632", "r1041_e82_400bps_fast_g615", "r1041_e82_400bps_fast_g632", - "r1041_e82_400bps_fast_variant_g615", "r1041_e82_400bps_fast_variant_g632", "r1041_e82_400bps_hac_g615", - "r1041_e82_400bps_hac_g632", "r1041_e82_400bps_hac_variant_g615", "r1041_e82_400bps_hac_variant_g632", "r1041_e82_400bps_sup_g615", - "r1041_e82_400bps_sup_variant_g615", "r104_e81_fast_g5015", "r104_e81_fast_variant_g5015", "r104_e81_hac_g5015", - "r104_e81_hac_variant_g5015", "r104_e81_sup_g5015", "r104_e81_sup_g610", "r104_e81_sup_variant_g610", "r10_min_high_g303", - "r10_min_high_g340", "r941_e81_fast_g514", "r941_e81_fast_variant_g514", "r941_e81_hac_g514", "r941_e81_hac_variant_g514", - "r941_e81_sup_g514", "r941_e81_sup_variant_g514", "r941_min_fast_g303", "r941_min_fast_g507", "r941_min_fast_snp_g507", - "r941_min_fast_variant_g507", "r941_min_hac_g507", "r941_min_hac_snp_g507", "r941_min_hac_variant_g507", "r941_min_high_g303", - "r941_min_high_g330", "r941_min_high_g340_rle", "r941_min_high_g344", "r941_min_high_g351", "r941_min_high_g360", "r941_min_sup_g507", - "r941_min_sup_snp_g507", "r941_min_sup_variant_g507", "r941_prom_fast_g303", "r941_prom_fast_g507", "r941_prom_fast_snp_g507", - "r941_prom_fast_variant_g507", "r941_prom_hac_g507", "r941_prom_hac_snp_g507", "r941_prom_hac_variant_g507", "r941_prom_high_g303", - "r941_prom_high_g330", "r941_prom_high_g344", "r941_prom_high_g360", "r941_prom_high_g4011", "r941_prom_snp_g303", "r941_prom_snp_g322", - "r941_prom_snp_g360", "r941_prom_sup_g507", "r941_prom_sup_snp_g507", "r941_prom_sup_variant_g507", "r941_prom_variant_g303", - "r941_prom_variant_g322", "r941_prom_variant_g360", "r941_sup_plant_g610", "r941_sup_plant_variant_g610" - ] + choices=MEDAKA_MODELS ) long_read_group.add_argument( @@ -1109,7 +1090,7 @@ def main(): type=str2bool, nargs='?', const=True, - default=True + default=False ) batch_options.add_argument( @@ -1136,6 +1117,14 @@ def main(): default='95' ) + batch_options.add_argument( + '--medaka-model', '--medaka_model', + help='Medaka model to use for polishing long reads. \n', + dest='medaka_model', + default="r941_min_hac_g507", + choices=MEDAKA_MODELS + ) + add_workflow_arg( batch_options, ['get_bam_indices', 'recover_mags', 'annotate', 'lorikeet'], diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py index 57df8d40..b3911927 100644 --- a/aviary/modules/processor.py +++ b/aviary/modules/processor.py @@ -35,6 +35,7 @@ import logging import os import subprocess +import copy from pathlib import Path from glob import glob @@ -42,6 +43,7 @@ from snakemake import utils from snakemake.io import load_configfile from ruamel.yaml import YAML # used for yaml reading with comments +from aviary import LONG_READ_TYPES BATCH_HEADER=['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble'] @@ -555,36 +557,41 @@ def process_batch(args, prefix): s2 = check_batch_input(batch.iloc[i, 2], "none", split=True) l = check_batch_input(batch.iloc[i, 3], "none", split=True) l_type = check_batch_input(batch.iloc[i, 4], "ont", split=False) + if l_type not in LONG_READ_TYPES: + logging.error(f"Unknown long read type {l_type} specified.") + logging.error(f"Valid long read types: {LONG_READ_TYPES}") + sys.exit(1) assembly = check_batch_input(batch.iloc[i, 5], None, split=False) coassemble = check_batch_input(batch.iloc[i, 6], False, split=False) - + + new_args = copy.deepcopy(args) # update the value of args - args.output = f"{prefix}/{sample}" - runs.append(args.output) - args.pe1 = s1 - args.pe2 = s2 + new_args.output = f"{prefix}/{sample}" + runs.append(new_args.output) + new_args.pe1 = s1 + new_args.pe2 = s2 - args.longreads = l - args.longread_type = l_type - args.assembly = assembly - args.coassemble = coassemble + new_args.longreads = l + new_args.longread_type = l_type + new_args.assembly = assembly + new_args.coassemble = coassemble # ensure output folder exists - if not os.path.exists(args.output): - os.makedirs(args.output) + if not os.path.exists(new_args.output): + os.makedirs(new_args.output) # setup processor for this line - processor = Processor(args) + processor = Processor(new_args) processor.make_config() - processor.run_workflow(cores=int(args.n_cores), - dryrun=args.dryrun, - clean=args.clean, - conda_frontend=args.conda_frontend, - snakemake_args=args.cmds, - rerun_triggers=args.rerun_triggers, - profile=args.snakemake_profile, - cluster_retries=args.cluster_retries, + processor.run_workflow(cores=int(new_args.n_cores), + dryrun=new_args.dryrun, + clean=new_args.clean, + conda_frontend=new_args.conda_frontend, + snakemake_args=new_args.cmds, + rerun_triggers=new_args.rerun_triggers, + profile=new_args.snakemake_profile, + cluster_retries=new_args.cluster_retries, write_to_script=write_to_script) if args.cluster: