From d4e7d5a1233cc7ed579c4d601498864c469f6c42 Mon Sep 17 00:00:00 2001 From: Kaitlyn Jiayi Li Date: Tue, 30 Jul 2024 14:33:48 -0700 Subject: [PATCH 1/5] bumped rfam mem and made variable --- rfam.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rfam.wdl b/rfam.wdl index cbd4057..d1499c5 100755 --- a/rfam.wdl +++ b/rfam.wdl @@ -43,6 +43,7 @@ task run { String claninfo_tsv String feature_lookup_tsv Int threads + Int memory = 100 String container String rfam_version_file = "rfam_version.txt" } @@ -67,8 +68,9 @@ task run { runtime { time: "1:00:00" - memory: "86G" docker: container + cpu: threads + memory: "~{memory} GiB" } output { From 20f5409f38b9ca2def5f60d2a0a2c8d871bdbf03 Mon Sep 17 00:00:00 2001 From: Kaitlyn Jiayi Li Date: Tue, 30 Jul 2024 15:50:47 -0700 Subject: [PATCH 2/5] also fixed some warnings from jaws validate --- annotation_full.wdl | 14 ++----- functional-annotation.wdl | 30 +-------------- structural-annotation.wdl | 80 +++++++++++++++++++-------------------- 3 files changed, 44 insertions(+), 80 deletions(-) diff --git a/annotation_full.wdl b/annotation_full.wdl index c763075..4e37bc1 100644 --- a/annotation_full.wdl +++ b/annotation_full.wdl @@ -6,11 +6,11 @@ import "./functional-annotation.wdl" as fa workflow annotation { input { String proj - String input_file + File input_file String imgap_project_id String database_location="/refdata/img/" String imgap_project_type="metagenome" - String? gm_license="/refdata/licenses/.gmhmmp2_key" + String gm_license="/refdata/licenses/.gmhmmp2_key" Int additional_threads=16 String container="microbiomedata/img-omics@sha256:d5f4306bf36a97d55a3710280b940b89d7d4aca76a343e75b0e250734bc82b71" @@ -148,9 +148,7 @@ input { call finish_ano { input: container=container, - input_file=make_map_file.out_fasta, proj=proj, - start=stage.start, ano_info_file=make_info_file.imgap_info, proteins_faa = merge_outputs.proteins_faa, structural_gff = merge_outputs.structural_gff, @@ -235,7 +233,7 @@ task stage { input { String container String target="input.fasta" - String input_file + File input_file } command <<< @@ -300,7 +298,7 @@ task make_map_file { task split { input { File infile - String blocksize=100 + Int blocksize=100 String zfile="zscore.txt" String cmzfile="cmzscore.txt" String container @@ -504,8 +502,6 @@ task make_info_file { String fa_version_file = "fa_tool_version.txt" String fa_db_version_file = "fa_db_version.txt" String rfam_version_file = "rfam_version.txt" - String sa_version_file = "sa_tool_version.txt" - String sa_db_version_file = "sa_db_version.txt" } command <<< set -euo pipefail @@ -618,9 +614,7 @@ task finish_ano { String container String proj String prefix=sub(proj, ":", "_") - String start File ano_info_file - File input_file File proteins_faa File structural_gff File functional_gff diff --git a/functional-annotation.wdl b/functional-annotation.wdl index 9edcfa5..1d392ff 100755 --- a/functional-annotation.wdl +++ b/functional-annotation.wdl @@ -15,15 +15,11 @@ workflow f_annotate { Int? par_hmm_inst Int? approx_num_proteins String smart_db="~{database_location}"+"/SMART/01_06_2016/SMART.hmm" - String hmmsearch_bin="/opt/omics/bin/hmmsearch" - String frag_hits_filter_bin="/opt/omics/bin/functional_annotation/hmmsearch_fragmented_hits_filter.py" String cog_db="~{database_location}"+"/COG/HMMs/2003/COG.hmm" String tigrfam_db="~{database_location}"+"/TIGRFAM/v15.0/TIGRFAM.hmm" - String hit_selector_bin="/opt/omics/bin/functional_annotation/hmmsearch_hit_selector.py" String superfam_db="~{database_location}"+"/SuperFamily/v1.75/supfam.hmm" String pfam_db="~{database_location}"+"/Pfam/Pfam-A/v34.0/Pfam-A.v34.0.hmm" String pfam_claninfo_tsv="~{database_location}"+"/Pfam/Pfam-A/v34.0/Pfam-A.clans.tsv" - String pfam_clan_filter="/opt/omics/bin/functional_annotation/pfam_clan_filter.py" String cath_funfam_db="~{database_location}"+"/Cath-FunFam/v4.2.0/funfam.hmm" # Boolean signalp_execute=true # String signalp_gram_stain="GRAM_STAIN" @@ -61,8 +57,6 @@ workflow f_annotate { par_hmm_inst = par_hmm_inst, approx_num_proteins = approx_num_proteins, smart_db = smart_db, - hmmsearch = hmmsearch_bin, - frag_hits_filter = frag_hits_filter_bin, container=hmm_container } @@ -74,8 +68,6 @@ workflow f_annotate { par_hmm_inst = par_hmm_inst, approx_num_proteins = approx_num_proteins, cog_db = cog_db, - hmmsearch = hmmsearch_bin, - frag_hits_filter = frag_hits_filter_bin, container=hmm_container } @@ -87,8 +79,6 @@ workflow f_annotate { par_hmm_inst = par_hmm_inst, approx_num_proteins = approx_num_proteins, tigrfam_db = tigrfam_db, - hmmsearch = hmmsearch_bin, - hit_selector = hit_selector_bin, container=hmm_container } @@ -100,8 +90,6 @@ workflow f_annotate { par_hmm_inst = par_hmm_inst, approx_num_proteins = approx_num_proteins, superfam_db = superfam_db, - hmmsearch = hmmsearch_bin, - frag_hits_filter = frag_hits_filter_bin, container=hmm_container } @@ -114,8 +102,6 @@ workflow f_annotate { approx_num_proteins = approx_num_proteins, pfam_db = pfam_db, pfam_claninfo_tsv = pfam_claninfo_tsv, - pfam_clan_filter = pfam_clan_filter, - hmmsearch = hmmsearch_bin, container=hmm_container } @@ -127,8 +113,6 @@ workflow f_annotate { par_hmm_inst = par_hmm_inst, approx_num_proteins = approx_num_proteins, cath_funfam_db = cath_funfam_db, - hmmsearch = hmmsearch_bin, - frag_hits_filter = frag_hits_filter_bin, container=hmm_container } @@ -256,8 +240,6 @@ task smart { Float min_domain_eval_cutoff = 0.01 Float aln_length_ratio = 0.7 Float max_overlap_ratio = 0.1 - String hmmsearch - String frag_hits_filter String base=basename(input_fasta) String container String hmmsearch_version_file = "hmmsearch_version.txt" @@ -304,8 +286,6 @@ task cog { Float min_domain_eval_cutoff = 0.01 Float aln_length_ratio = 0.7 Float max_overlap_ratio = 0.1 - String hmmsearch - String frag_hits_filter String base=basename(input_fasta) String container String hmmsearch_version_file = "hmmsearch_version.txt" @@ -351,8 +331,6 @@ task tigrfam { Int approx_num_proteins = 0 Float aln_length_ratio = 0.7 Float max_overlap_ratio = 0.1 - String hmmsearch - String hit_selector String base=basename(input_fasta) String container String hmmsearch_version_file = "hmmsearch_version.txt" @@ -400,8 +378,6 @@ task superfam { Float min_domain_eval_cutoff = 0.01 Float aln_length_ratio = 0.7 Float max_overlap_ratio = 0.1 - String hmmsearch - String frag_hits_filter String base=basename(input_fasta) String container String hmmsearch_version_file = "hmmsearch_version.txt" @@ -447,8 +423,6 @@ task pfam { Int threads = 62 Int par_hmm_inst = 15 Int approx_num_proteins = 0 - String hmmsearch - String pfam_clan_filter String base=basename(input_fasta) String container String hmmsearch_version_file = "hmmsearch_version.txt" @@ -495,8 +469,6 @@ task cath_funfam { Float min_domain_eval_cutoff = 0.01 Float aln_length_ratio = 0.7 Float max_overlap_ratio = 0.1 - String hmmsearch - String frag_hits_filter String base=basename(input_fasta) String container String hmmsearch_version_file = "hmmsearch_version.txt" @@ -529,7 +501,7 @@ task cath_funfam { } } -task signalp { +task run_signalp { input { String project_id String prefix=sub(project_id, ":", "_") diff --git a/structural-annotation.wdl b/structural-annotation.wdl index 207ccc2..4435088 100755 --- a/structural-annotation.wdl +++ b/structural-annotation.wdl @@ -27,7 +27,7 @@ workflow s_annotate { # } - call trnascan.trnascan { + call trnascan.trnascan as ts{ input: imgap_input_fasta = imgap_input_fasta, imgap_project_id = imgap_project_id, @@ -36,7 +36,7 @@ workflow s_annotate { } - call rfam.rfam { + call rfam.rfam as rf{ input: cmzscore = cmzscore, imgap_input_fasta = imgap_input_fasta, @@ -47,7 +47,7 @@ workflow s_annotate { } - call crt.crt { + call crt.crt as ct { input: imgap_input_fasta = imgap_input_fasta, imgap_project_id = imgap_project_id, @@ -56,7 +56,7 @@ workflow s_annotate { - call cds_prediction.cds_prediction { + call cds_prediction.cds_prediction as cds{ input: imgap_input_fasta = imgap_input_fasta, imgap_project_id = imgap_project_id, @@ -72,10 +72,10 @@ workflow s_annotate { input: input_fasta = imgap_input_fasta, project_id = imgap_project_id, - rfam_gff = rfam.rfam_gff, - trna_gff = trnascan.gff, - crt_gff = crt.gff, - cds_gff = cds_prediction.gff, + rfam_gff = rf.rfam_gff, + trna_gff = ts.gff, + crt_gff = ct.gff, + cds_gff = cds.gff, container = container } @@ -84,8 +84,8 @@ workflow s_annotate { # input_fasta = imgap_input_fasta, project_id = imgap_project_id, final_gff = gff_merge.final_gff, - cds_genes = cds_prediction.genes, - cds_proteins = cds_prediction.proteins, + cds_genes = cds.genes, + cds_proteins = cds.proteins, container = container } @@ -94,7 +94,6 @@ workflow s_annotate { call gff_and_fasta_stats { input: input_fasta = imgap_input_fasta, - project_id = imgap_project_id, final_gff = gff_merge.final_gff, container = container } @@ -104,24 +103,24 @@ workflow s_annotate { output { File gff = gff_merge.final_gff - File crt_gff = crt.gff - File crisprs = crt.crisprs - File crt_out = crt.crt_out - File genemark_gff = cds_prediction.genemark_gff - File genemark_genes = cds_prediction.genemark_genes - File genemark_proteins = cds_prediction.genemark_proteins - File prodigal_gff = cds_prediction.prodigal_gff - File prodigal_genes = cds_prediction.prodigal_genes - File prodigal_proteins = cds_prediction.prodigal_proteins - File cds_gff = cds_prediction.gff - File cds_proteins = cds_prediction.proteins - File cds_genes = cds_prediction.genes - File trna_gff = trnascan.gff - File trna_bacterial_out = trnascan.bacterial_out - File trna_archaeal_out = trnascan.archaeal_out - File rfam_gff = rfam.rfam_gff - File rfam_tbl = rfam.rfam_tbl - String rfam_version = rfam.rfam_version + File crt_gff = ct.gff + File crisprs = ct.crisprs + File crt_out = ct.crt_out + File genemark_gff = cds.genemark_gff + File genemark_genes = cds.genemark_genes + File genemark_proteins = cds.genemark_proteins + File prodigal_gff = cds.prodigal_gff + File prodigal_genes = cds.prodigal_genes + File prodigal_proteins = cds.prodigal_proteins + File cds_gff = cds.gff + File cds_proteins = cds.proteins + File cds_genes = cds.genes + File trna_gff = ts.gff + File trna_bacterial_out = ts.bacterial_out + File trna_archaeal_out = ts.archaeal_out + File rfam_gff = rf.rfam_gff + File rfam_tbl = rf.rfam_tbl + String rfam_version = rf.rfam_version File proteins = fasta_merge.final_proteins File genes = fasta_merge.final_genes } @@ -138,28 +137,28 @@ task pre_qc { Int seqs_per_million_bp_cutoff = 500 Int min_seq_length = 150 String container - File tmp_fasta="~{input_fasta}.tmp" - File qced_fasta="~{prefix}_contigs.fna" + String tmp_fasta="~{input_fasta}.tmp" + String qced_fasta="~{prefix}_contigs.fna" } command <<< set -euo pipefail echo ~{tmp_fasta} grep -v '^\s*$' ~{input_fasta} | tr -d '\r' | \ - sed 's/^>[[:blank:]]*/>/g' > $tmp_fasta - acgt_count=`grep -v '^>' $tmp_fasta | grep -o [acgtACGT] | wc -l` - n_count=`grep -v '^>' $tmp_fasta | grep -o '[^acgtACGT]' | wc -l` + sed 's/^>[[:blank:]]*/>/g' > ~{tmp_fasta} + acgt_count=`grep -v '^>' ~{tmp_fasta} | grep -o [acgtACGT] | wc -l` + n_count=`grep -v '^>' ~{tmp_fasta} | grep -o '[^acgtACGT]' | wc -l` n_ratio=`echo ~n_count $acgt_count | awk '{printf "%f", $1 / $2}'` if (( $(echo "~n_ratio >= ~{n_ratio_cutoff}" | bc) )) then - rm $tmp_fasta + rm ~{tmp_fasta} exit 1 fi if [[ ~{project_type} == "isolate" ]] then - seq_count=`grep -c '^>' ~tmp_fasta` - bp_count=`grep -v '^>' ~tmp_fasta | tr -d '\n' | wc -m` + seq_count=`grep -c '^>' ~{tmp_fasta}` + bp_count=`grep -v '^>' ~{tmp_fasta} | tr -d '\n' | wc -m` seqs_per_million_bp=$seq_count if (( $bp_count > 1000000 )) then @@ -169,13 +168,13 @@ task pre_qc { fi if (( $(echo "~seqs_per_million_bp > ~{seqs_per_million_bp_cutoff}" | bc) )) then - rm $tmp_fasta + rm ~{tmp_fasta} exit 1 fi fi ~{bin} -v - ~{bin} ~tmp_fasta ~qced_fasta -l ~{min_seq_length} - rm ~tmp_fasta + ~{bin} ~{tmp_fasta} ~{qced_fasta} -l ~{min_seq_length} + rm ~{tmp_fasta} >>> runtime { @@ -265,7 +264,6 @@ task gff_and_fasta_stats { input { String bin="/opt/omics/bin/structural_annotation/gff_and_final_fasta_stats.py" File input_fasta - String project_id File final_gff String container } From 04d39de557744eeca600972cad6398c5a256b3f9 Mon Sep 17 00:00:00 2001 From: Kaitlyn Jiayi Li Date: Tue, 30 Jul 2024 15:59:01 -0700 Subject: [PATCH 3/5] brought additional memory parameter out to metaT level --- annotation_full.wdl | 4 +++- rfam.wdl | 4 +++- structural-annotation.wdl | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/annotation_full.wdl b/annotation_full.wdl index 4e37bc1..0395635 100644 --- a/annotation_full.wdl +++ b/annotation_full.wdl @@ -12,6 +12,7 @@ input { String imgap_project_type="metagenome" String gm_license="/refdata/licenses/.gmhmmp2_key" Int additional_threads=16 + Int additional_memory = 100 String container="microbiomedata/img-omics@sha256:d5f4306bf36a97d55a3710280b940b89d7d4aca76a343e75b0e250734bc82b71" # structural annotation @@ -49,7 +50,8 @@ input { imgap_project_type = imgap_project_type, database_location = database_location, container=container, - gm_license=gm_license + gm_license=gm_license, + additional_memory = additional_memory } diff --git a/rfam.wdl b/rfam.wdl index d1499c5..54124f1 100755 --- a/rfam.wdl +++ b/rfam.wdl @@ -5,6 +5,7 @@ workflow rfam { File imgap_input_fasta String imgap_project_id Int additional_threads + Int additional_memory String database_location="/refdata/img/" String cm="~{database_location}"+"Rfam/13.0/Rfam.cm" String claninfo_tsv="~{database_location}"+"Rfam/13.0/Rfam.claninfo" @@ -21,7 +22,8 @@ workflow rfam { feature_lookup_tsv = feature_lookup_tsv, claninfo_tsv = claninfo_tsv, threads = additional_threads, - container=container + container=container, + memory = additional_memory } output { diff --git a/structural-annotation.wdl b/structural-annotation.wdl index 4435088..95982e4 100755 --- a/structural-annotation.wdl +++ b/structural-annotation.wdl @@ -12,6 +12,7 @@ workflow s_annotate { String imgap_project_id String imgap_project_type Int additional_threads + Int additional_memory Int? imgap_structural_annotation_translation_table String database_location String container @@ -43,7 +44,8 @@ workflow s_annotate { imgap_project_id = imgap_project_id, database_location = database_location, additional_threads = additional_threads, - container=container + container=container, + additional_memory = additional_memory } From ee2956f2c751b5eb2c154960995bfa962147db77 Mon Sep 17 00:00:00 2001 From: Kaitlyn Jiayi Li Date: Tue, 6 Aug 2024 09:59:55 -0700 Subject: [PATCH 4/5] version bump --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 0f1acbd..c641220 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v1.1.2 +v1.1.4 From 62fd2774211f1899010048e7fbf8135209873a68 Mon Sep 17 00:00:00 2001 From: Kaitlyn Jiayi Li Date: Thu, 8 Aug 2024 14:06:22 -0700 Subject: [PATCH 5/5] revert file to string --- annotation_full.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/annotation_full.wdl b/annotation_full.wdl index 0395635..00a2f12 100644 --- a/annotation_full.wdl +++ b/annotation_full.wdl @@ -6,7 +6,7 @@ import "./functional-annotation.wdl" as fa workflow annotation { input { String proj - File input_file + String input_file String imgap_project_id String database_location="/refdata/img/" String imgap_project_type="metagenome" @@ -235,7 +235,7 @@ task stage { input { String container String target="input.fasta" - File input_file + String input_file } command <<<