From aee52bf0fd1b4587b1de277b092fc58eefcbaab5 Mon Sep 17 00:00:00 2001 From: Kaitlyn Jiayi Li Date: Tue, 25 Jun 2024 07:36:23 -0700 Subject: [PATCH] removed most legacy conditionals --- annotation_full.wdl | 52 ++++------ functional-annotation.wdl | 9 +- prodigal.wdl | 213 -------------------------------------- structural-annotation.wdl | 59 ++--------- test-small.wdl | 3 +- 5 files changed, 29 insertions(+), 307 deletions(-) delete mode 100755 prodigal.wdl diff --git a/annotation_full.wdl b/annotation_full.wdl index d596215..9a94410 100644 --- a/annotation_full.wdl +++ b/annotation_full.wdl @@ -10,7 +10,7 @@ input { String imgap_project_id String database_location="/refdata/img/" String imgap_project_type="metagenome" - String gm_license="/refdata/licenses/.gmhmmp2_key" + String? gm_license="/refdata/licenses/.gmhmmp2_key" Int additional_threads=16 String container="microbiomedata/img-omics@sha256:d5f4306bf36a97d55a3710280b940b89d7d4aca76a343e75b0e250734bc82b71" @@ -117,7 +117,6 @@ input { container=container, sa_execute = sa_execute, fa_execute = fa_execute, - map_execute = make_map_file.map_execute, map_info = make_map_file.out_log, structural_gff = merge_outputs.structural_gff, imgap_version = split.imgap_version, @@ -176,7 +175,6 @@ input { rfam_gff = merge_outputs.rfam_gff, product_names_tsv = merge_outputs.product_names_tsv, crt_crisprs = merge_outputs.crt_crisprs, - map_execute = make_map_file.map_execute, map_file = make_map_file.map_file, renamed_fasta = make_map_file.out_fasta } @@ -212,8 +210,8 @@ input { File product_names_tsv = finish_ano.final_product_names_tsv File crt_crisprs = finish_ano.final_crt_crisprs File imgap_version = finish_ano.final_version - File? renamed_fasta = finish_ano.final_renamed_fasta - File? map_file = finish_ano.final_map_file + File renamed_fasta = finish_ano.final_renamed_fasta + File map_file = finish_ano.final_map_file } parameter_meta { @@ -275,29 +273,21 @@ task make_map_file { } command <<< - find_prefix=`grep ~{proj_id} ~{input_file} | head -1` - - set -euo pipefail - if [[ $find_prefix ]] - then - echo "false" > run_map.txt - ln ~{input_file} ~{prefix}_map.fasta || ln -s ~{input_file} ~{prefix}_map.fasta - else - echo "true" > run_map.txt + set -euo pipefail + fasta_sanity.py -v fasta_sanity.py \ -p ~{proj_id} \ -l ~{min_seq_length} \ -u ~{unknown_gap_length} \ ~{input_file} ~{prefix}_map.fasta - fi + >>> output{ - File? map_file = "~{prefix}_contig_names_mapping.tsv" + File map_file = "~{prefix}_contig_names_mapping.tsv" File out_fasta = "~{prefix}_map.fasta" File out_log = stdout() - Boolean map_execute = read_boolean("run_map.txt") } runtime { memory: "120G" @@ -489,7 +479,6 @@ task make_info_file { input { String container String imgap_version - Boolean map_execute File map_info Boolean fa_execute Boolean sa_execute @@ -522,12 +511,11 @@ task make_info_file { set -euo pipefail echo "IMGAP Version: ~{imgap_version}" > ~{prefix}_imgap.info #get map script version - if [[ "~{map_execute}" = true ]] - then - map_version=`grep "fasta_sanity.py" ~{map_info}` - map_version="Mapping Programs Used: $map_version" - echo $map_version >> ~{prefix}_imgap.info - fi + + map_version=`grep "fasta_sanity.py" ~{map_info}` + map_version="Mapping Programs Used: $map_version" + echo $map_version >> ~{prefix}_imgap.info + #get structual annotation versions if [[ "~{sa_execute}" = true ]] then @@ -656,9 +644,8 @@ task finish_ano { File stats_json File product_names_tsv File crt_crisprs - Boolean map_execute - File? map_file - File? renamed_fasta + File map_file + File renamed_fasta String orig_prefix="scaffold" String sed="s/~{orig_prefix}_/~{proj}_/g" } @@ -695,12 +682,9 @@ task finish_ano { cat ~{stats_json} | sed ~{sed} > ~{prefix}_stats.json ln ~{ano_info_file} ~{prefix}_imgap.info || ln -s ~{ano_info_file} ~{prefix}_imgap.info + ln ~{map_file} ~{prefix}_contig_names_mapping.tsv || ln -s ~{map_file} ~{prefix}_contig_names_mapping.tsv + ln ~{renamed_fasta} ~{prefix}_contigs.fna || ln -s ~{renamed_fasta} ~{prefix}_contigs.fna - if [[ "~{map_execute}" = true ]] - then - ln ~{map_file} ~{prefix}_contig_names_mapping.tsv || ln -s ~{map_file} ~{prefix}_contig_names_mapping.tsv - ln ~{renamed_fasta} ~{prefix}_contigs.fna || ln -s ~{renamed_fasta} ~{prefix}_contigs.fna - fi >>> output { @@ -731,8 +715,8 @@ task finish_ano { File final_product_names_tsv = "~{prefix}_product_names.tsv" File final_lineage_tsv = "~{prefix}_scaffold_lineage.tsv" File final_crt_crisprs = "~{prefix}_crt.crisprs" - File? final_renamed_fasta = "~{prefix}_contigs.fna" - File? final_map_file = "~{prefix}_contig_names_mapping.tsv" + File final_renamed_fasta = "~{prefix}_contigs.fna" + File final_map_file = "~{prefix}_contig_names_mapping.tsv" File final_tsv = "~{prefix}_stats.tsv" File final_version = "~{prefix}_imgap.info" diff --git a/functional-annotation.wdl b/functional-annotation.wdl index d7ddba5..9edcfa5 100755 --- a/functional-annotation.wdl +++ b/functional-annotation.wdl @@ -6,31 +6,24 @@ workflow f_annotate { Int additional_threads # File input_contigs_fasta File input_fasta - String database_location - Boolean ko_ec_execute=true + String database_location="/refdata/img/" String ko_ec_img_nr_db="~{database_location}"+"/IMG-NR/20230629/img_nr" String ko_ec_md5_mapping="~{database_location}"+"/IMG-NR/20230629/md5Hash2Data.tsv" String ko_ec_taxon_to_phylo_mapping="~{database_location}"+"/IMG-NR/20230629/taxonOId2Taxonomy.tsv" String lastal_bin="/opt/omics/bin/lastal" String selector_bin="/opt/omics/bin/functional_annotation/lastal_img_nr_ko_ec_gene_phylo_hit_selector.py" - Boolean smart_execute=true Int? par_hmm_inst Int? approx_num_proteins String smart_db="~{database_location}"+"/SMART/01_06_2016/SMART.hmm" String hmmsearch_bin="/opt/omics/bin/hmmsearch" String frag_hits_filter_bin="/opt/omics/bin/functional_annotation/hmmsearch_fragmented_hits_filter.py" - Boolean cog_execute=true String cog_db="~{database_location}"+"/COG/HMMs/2003/COG.hmm" - Boolean tigrfam_execute=true String tigrfam_db="~{database_location}"+"/TIGRFAM/v15.0/TIGRFAM.hmm" String hit_selector_bin="/opt/omics/bin/functional_annotation/hmmsearch_hit_selector.py" - Boolean superfam_execute=true String superfam_db="~{database_location}"+"/SuperFamily/v1.75/supfam.hmm" - Boolean pfam_execute=true String pfam_db="~{database_location}"+"/Pfam/Pfam-A/v34.0/Pfam-A.v34.0.hmm" String pfam_claninfo_tsv="~{database_location}"+"/Pfam/Pfam-A/v34.0/Pfam-A.clans.tsv" String pfam_clan_filter="/opt/omics/bin/functional_annotation/pfam_clan_filter.py" - Boolean cath_funfam_execute=true String cath_funfam_db="~{database_location}"+"/Cath-FunFam/v4.2.0/funfam.hmm" # Boolean signalp_execute=true # String signalp_gram_stain="GRAM_STAIN" diff --git a/prodigal.wdl b/prodigal.wdl deleted file mode 100755 index 0ea51f3..0000000 --- a/prodigal.wdl +++ /dev/null @@ -1,213 +0,0 @@ -version 1.0 -workflow prodigal { - input { - String imgap_input_fasta - String imgap_project_id - String imgap_project_type - String container - } - if(imgap_project_type == "isolate") { - call fasta_len { - input: - input_fasta = imgap_input_fasta - } - } - if(imgap_project_type == "isolate" && fasta_len.wc >= 20000) { - call iso_big { - input: - input_fasta = imgap_input_fasta, - project_id = imgap_project_id, - container=container - } - } - if(imgap_project_type == "isolate" && fasta_len.wc < 20000) { - call iso_small { - input: - input_fasta = imgap_input_fasta, - project_id = imgap_project_id, - container=container - } - } - if(imgap_project_type == "metagenome") { - call metag { - input: - input_fasta = imgap_input_fasta, - project_id = imgap_project_id, - container=container - } - } - - call clean_and_unify { - input: - iso_big_proteins_fasta = iso_big.proteins, - iso_small_proteins_fasta = iso_small.proteins, - meta_proteins_fasta = metag.proteins, - iso_big_genes_fasta = iso_big.genes, - iso_small_genes_fasta = iso_small.genes, - meta_genes_fasta = metag.genes, - iso_big_gff = iso_big.gff, - iso_small_gff = iso_small.gff, - meta_gff = metag.gff, - project_id = imgap_project_id, - container=container - } - - output { - File gff = clean_and_unify.gff - File genes = clean_and_unify.genes - File proteins = clean_and_unify.proteins - } -} - -task fasta_len { - input { - File input_fasta - } - command { - grep -v '^>' ~{input_fasta} | wc -m - } - - runtime { - time: "1:00:00" - memory: "86G" - } - - output { - Int wc = select_first([read_int(stdout()),0]) - } -} - -task iso_big { - input { - String bin="/opt/omics/bin/prodigal" - File input_fasta - Int? translation_table = 11 - String project_id - String prefix=sub(project_id, ":", "_") - File train = "~{prefix}_prodigal.trn" - String container - } - command { - set -euo pipefail - ~{bin} -i ~{input_fasta} -t ~{train} -g ~{translation_table} -q - ~{bin} -f gff -g ~{translation_table} -p single -m -i ~{input_fasta} \ - -t ~{train} -o ~{prefix}_prodigal.gff \ - -d ~{prefix}_prodigal_genes.fna -a ~{prefix}_prodigal_proteins.faa - } - - runtime { - time: "1:00:00" - memory: "86G" - docker: container - } - - output { - File gff = "~{prefix}_prodigal.gff" - File genes = "~{prefix}_prodigal_genes.fna" - File proteins = "~{prefix}_prodigal_proteins.faa" - } -} - -task iso_small { - input { - String bin="/opt/omics/bin/prodigal" - File input_fasta - String project_id - String prefix=sub(project_id, ":", "_") - String container - } - command { - ~{bin} -f gff -p meta -m -i ~{input_fasta} \ - -o ~{prefix}_prodigal.gff -d ~{prefix}_prodigal_genes.fna \ - -a ~{prefix}_prodigal_proteins.faa - } - - runtime { - time: "1:00:00" - memory: "86G" - docker: container - } - - output { - File gff = "~{prefix}_prodigal.gff" - File genes = "~{prefix}_prodigal_genes.fna" - File proteins = "~{prefix}_prodigal_proteins.faa" - } -} - -task metag { - input { - String bin="/opt/omics/bin/prodigal" - File input_fasta - String project_id - String prefix=sub(project_id, ":", "_") - String container - } - - command <<< - set -eou pipefail - ~{bin} -f gff -p meta -m -i ~{input_fasta} \ - -o ~{prefix}_prodigal.gff -d ~{prefix}_prodigal_genes.fna \ - -a ~{prefix}_prodigal_proteins.faa - >>> - - runtime { - time: "1:00:00" - memory: "86G" - docker: container - } - - output { - File gff = "~{prefix}_prodigal.gff" - File genes = "~{prefix}_prodigal_genes.fna" - File proteins = "~{prefix}_prodigal_proteins.faa" - } -} - -task clean_and_unify { - input { - File? iso_big_proteins_fasta - File? iso_small_proteins_fasta - File? meta_proteins_fasta - File? iso_big_genes_fasta - File? iso_small_genes_fasta - File? meta_genes_fasta - File? iso_big_gff - File? iso_small_gff - File? meta_gff - String unify_bin="/opt/omics/bin/structural_annotation/unify_gene_ids.py" - String project_id - String prefix=sub(project_id, ":", "_") - String container - } - command <<< - set -eou pipefail - sed -i 's/\*$//g' ~{iso_big_proteins_fasta} ~{iso_small_proteins_fasta} ~{meta_proteins_fasta} - sed -i 's/\*/X/g' ~{iso_big_proteins_fasta} ~{iso_small_proteins_fasta} ~{meta_proteins_fasta} - ~{unify_bin} ~{iso_big_gff} ~{iso_small_gff} ~{meta_gff} \ - ~{iso_big_genes_fasta} ~{iso_small_genes_fasta} ~{meta_genes_fasta} \ - ~{iso_big_proteins_fasta} ~{iso_small_proteins_fasta} ~{meta_proteins_fasta} - mv ~{iso_big_proteins_fasta} . 2> /dev/null - mv ~{iso_small_proteins_fasta} . 2> /dev/null - mv ~{meta_proteins_fasta} . 2> /dev/null - mv ~{iso_big_genes_fasta} . 2> /dev/null - mv ~{iso_small_genes_fasta} . 2> /dev/null - mv ~{meta_genes_fasta} . 2> /dev/null - mv ~{iso_big_gff} . 2> /dev/null - mv ~{iso_small_gff} . 2> /dev/null - mv ~{meta_gff} . 2> /dev/null - >>> - - runtime { - time: "1:00:00" - memory: "86G" - docker: container - } - - output { - File gff = "~{prefix}_prodigal.gff" - File genes = "~{prefix}_prodigal_genes.fna" - File proteins = "~{prefix}_prodigal_proteins.faa" - } -} - diff --git a/structural-annotation.wdl b/structural-annotation.wdl index 2d113b0..207ccc2 100755 --- a/structural-annotation.wdl +++ b/structural-annotation.wdl @@ -13,14 +13,6 @@ workflow s_annotate { String imgap_project_type Int additional_threads Int? imgap_structural_annotation_translation_table - Boolean pre_qc_execute=false - Boolean trnascan_se_execute=true - Boolean rfam_execute=true - Boolean crt_execute=true - Boolean cds_prediction_execute=true - Boolean prodigal_execute=true - Boolean genemark_execute=true - Boolean gff_and_fasta_stats_execute=true String database_location String container String gm_license="/refdata/licenses/.gmhmmp2_key" @@ -69,8 +61,6 @@ workflow s_annotate { imgap_input_fasta = imgap_input_fasta, imgap_project_id = imgap_project_id, imgap_project_type = imgap_project_type, - prodigal_execute = prodigal_execute, - genemark_execute = genemark_execute, imgap_structural_annotation_translation_table = imgap_structural_annotation_translation_table, container = container, gm_license = gm_license @@ -86,11 +76,6 @@ workflow s_annotate { trna_gff = trnascan.gff, crt_gff = crt.gff, cds_gff = cds_prediction.gff, - prodigal_execute = prodigal_execute, - genemark_execute = genemark_execute, - crt_execute = crt_execute, - rfam_execute = rfam_execute, - trnascan_se_execute = trnascan_se_execute, container = container } @@ -216,47 +201,19 @@ task gff_merge { File rfam_gff File crt_gff File cds_gff - Boolean prodigal_execute - Boolean genemark_execute - Boolean crt_execute - Boolean rfam_execute - Boolean trnascan_se_execute String container } command <<< set -euo pipefail - # set cromwell booleans as bash variables - prodigal_execute=~{prodigal_execute} - genemark_execute=~{genemark_execute} - crt_execute=~{crt_execute} - rfam_execute=~{rfam_execute} - trnascan_se_execute=~{trnascan_se_execute} - - #construct arguments for gff_files_merger.py - merger_args="--contigs_fasta ~{input_fasta}" - - if [[ "$prodigal_execute" = true ]] || [[ "$genemark_execute" = true ]] ; then - merger_args="$merger_args --cds_gff ~{cds_gff}" - fi - - if [[ "$crt_execute" = true ]] ; then - merger_args="$merger_args --crt_gff ~{crt_gff}" - fi - - if [[ ("$prodigal_execute" = true || "$genemark_execute" = true) ]] && [[ "$crt_execute" = true ]] ; then - merger_args="$merger_args --log_file ~{prefix}_gff_merge.log" - fi - - if [[ "$rfam_execute" = true ]] ; then - merger_args="$merger_args ~{rfam_gff}" - fi - - if [[ "$trnascan_se_execute" = true ]] ; then - merger_args="$merger_args ~{trna_gff}" - fi - #excute gff_files_merger.py - ~{bin} $merger_args 1> ~{prefix}_structural_annotation.gff + ~{bin} \ + --contigs_fasta ~{input_fasta} \ + --cds_gff ~{cds_gff} \ + --crt_gff ~{crt_gff} \ + --log_file ~{prefix}_gff_merge.log \ + ~{rfam_gff} \ + ~{trna_gff} \ + 1> ~{prefix}_structural_annotation.gff >>> diff --git a/test-small.wdl b/test-small.wdl index fec4f7b..1ad0b1a 100755 --- a/test-small.wdl +++ b/test-small.wdl @@ -42,6 +42,7 @@ task prepare { String url } command <<< + set -eou pipefail wget ~{url}/~{prefix}_contigs.fna >>> @@ -68,7 +69,7 @@ task validate { } command <<< - set -e + set -eou pipefail wget ~{url}/~{prefix}_functional_annotation.gff wget ~{url}/~{prefix}_structural_annotation.gff validate.sh ~{func_gff}