Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

283 sync with nmdc metamags outputs #286

Merged
merged 13 commits into from
Aug 26, 2024
123 changes: 70 additions & 53 deletions data/workflow/WDL/metaG/mbin_nmdc.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,11 @@ workflow nmdc_mags {
String ko_file
String pfam_file
String tigrfam_file
String cath_funfam_file
String smart_file
String supfam_file
String crispr_file
String product_names_file
String gene_phylogeny_file
String lineage_file
String? map_file
File? domain_file
String? scratch_dir
Int cpu=32
Int threads=64
Expand All @@ -25,7 +22,7 @@ workflow nmdc_mags {
String checkm_db="/refdata/checkM_DB/checkm_data_2015_01_16"
String eukcc2_db="/refdata/EUKCC2_DB/eukcc2_db_ver_1.2"
String package_container = "microbiomedata/nmdc_mbin_vis:0.7.0"
String container = "microbiomedata/nmdc_mbin@sha256:57930406fb5cc364bacfc904066519de6cdc2d0ceda9db0eebf2336df3ef5349"
String container = "microbiomedata/nmdc_mbin@sha256:f3b154718474d2e21b53dbf4c1c35a1d3190eba3fe6b8f3fda105fcde22d9639"

call stage {
input:
Expand All @@ -39,19 +36,24 @@ workflow nmdc_mags {
ko_file=ko_file,
pfam_file=pfam_file,
tigrfam_file=tigrfam_file,
cath_funfam_file=cath_funfam_file,
smart_file=smart_file,
supfam_file=supfam_file,
crispr_file=crispr_file,
product_names_file=product_names_file,
gene_phylogeny_file=gene_phylogeny_file,
lineage_file=lineage_file,
map_file=map_file
}

call check_id_map {
input:
container=container,
contig_file=stage.contig,
proteins_file=stage.proteins
}

call mbin_nmdc {
input:
name=proj,
fna = stage.contig,
fna = check_id_map.contig,
aln = stage.sam,
gff = stage.gff,
lineage=stage.lineage_tsv,
Expand All @@ -74,22 +76,16 @@ workflow nmdc_mags {
ko_file=stage.ko,
pfam_file=stage.pfam,
tigrfam_file=stage.tigrfam,
cath_funfam_file=stage.cath_funfam,
smart_file=stage.smart,
supfam_file=stage.supfam,
crispr_file=stage.crispr,
gene_phylogeny_file=stage.gene_phylogeny,
product_names_file=stage.product_names,
container=package_container
}

call finish_mags {
input:
container="microbiomedata/workflowmeta:1.1.1",
contigs=stage.contig,
anno_gff=stage.gff,
sorted_bam=stage.sam,
proj=proj,
start=stage.start,
checkm = mbin_nmdc.checkm,
bacsum= mbin_nmdc.bacsum,
arcsum = mbin_nmdc.arcsum,
short = mbin_nmdc.short,
Expand All @@ -100,8 +96,6 @@ workflow nmdc_mags {
mbin_version = mbin_nmdc.mbin_version,
stats_json = package.stats_json,
stats_tsv = mbin_nmdc.stats_tsv,
hqmq_bin_fasta_files = mbin_nmdc.hqmq_bin_fasta_files,
bin_fasta_files = mbin_nmdc.lq_bin_fasta_files,
hqmq_bin_tarfiles = package.hqmq_bin_tarfiles,
lq_bin_tarfiles = package.lq_bin_tarfiles,
barplot = package.barplot,
Expand Down Expand Up @@ -227,9 +221,7 @@ task stage {
String ko_file
String pfam_file
String tigrfam_file
String cath_funfam_file
String smart_file
String supfam_file
String crispr_file
String product_names_file
String gene_phylogeny_file
String lineage_file
Expand All @@ -243,9 +235,7 @@ task stage {
String ko_out="ko.tsv"
String pfam_out="pfam.gff"
String tigrfam_out="tigrfam.gff"
String cath_funfam_out="cath_funfam.gff"
String smart_out="smart.gff"
String supfam_out="supfam.gff"
String crispr_out="crispr.tsv"
String products_out="products.tsv"
String gene_phylogeny_out="gene_phylogeny.tsv"
String lineage_out="lineage.tsv"
Expand Down Expand Up @@ -274,9 +264,7 @@ task stage {
stage ${ko_file} ${ko_out} &
stage ${pfam_file} ${pfam_out} &
stage ${tigrfam_file} ${tigrfam_out} &
stage ${cath_funfam_file} ${cath_funfam_out} &
stage ${smart_file} ${smart_out} &
stage ${supfam_file} ${supfam_out} &
stage ${crispr_file} ${crispr_out} &
stage ${product_names_file} ${products_out} &
stage ${gene_phylogeny_file} ${gene_phylogeny_out} &
stage ${lineage_file} ${lineage_out}
Expand All @@ -287,21 +275,19 @@ task stage {
>>>

output{
File contig = "contigs.fasta"
File sam = "pairedMapped_sorted.bam"
File gff = "functional_annotation.gff"
File proteins = "proteins.faa"
File cog = "cog.gff"
File ec = "ec.tsv"
File ko = "ko.tsv"
File pfam = "pfam.gff"
File tigrfam = "tigrfam.gff"
File cath_funfam = "cath_funfam.gff"
File smart = "smart.gff"
File supfam = "supfam.gff"
File product_names = "products.tsv"
File gene_phylogeny = "gene_phylogeny.tsv"
File lineage_tsv = "lineage.tsv"
File contig = contigs_out
File sam = bam_out
File gff = gff_out
File proteins = proteins_out
File cog = cog_out
File ec = ec_out
File ko = ko_out
File pfam = pfam_out
File tigrfam = tigrfam_out
File crispr = crispr_out
File product_names = products_out
File gene_phylogeny = gene_phylogeny_out
File lineage_tsv = lineage_out
File? map_tsv = map_out
String start = read_string("start.txt")
}
Expand All @@ -313,6 +299,44 @@ task stage {
}
}

# Consistency check between the contig FASTA and the protein FASTA.
# Every protein header is reduced to a contig ID by dropping its last two
# "_"-separated fields; the task fails (exit code 1) on the first protein whose
# derived contig ID does not appear among the contig headers.
# The contig file itself is returned unchanged as the only output.
task check_id_map{

    String container       # docker image the check runs in
    File contig_file       # contig FASTA; headers supply the set of known contig IDs
    File proteins_file     # protein FASTA; headers are checked against the contig IDs
    String contig_file_name=basename(contig_file)  # used only in the error message

    command<<<
        set -euo pipefail

        # The heredoc delimiter is quoted so the shell hands the Python source to
        # the interpreter verbatim (no "$" or backtick expansion). The WDL
        # placeholders below are substituted before the shell ever runs.
        python <<'CODE'
        import sys

        # IDs of all contigs: first whitespace-delimited token after ">".
        contig_ids = set()
        with open("${contig_file}","r") as c_file:
            for line in c_file:
                if line.startswith(">"):
                    seq_id = line[1:].rstrip().split()[0]  # nmdc:wfmgan-12-gbysvd76.1_0000001
                    contig_ids.add(seq_id)

        with open("${proteins_file}","r") as p_file:
            for line in p_file:
                if line.startswith(">"):
                    seq_id = line[1:].rstrip().split()[0]  # nmdc:wfmgan-12-gbysvd76.1_0000001_1_225
                    # Drop the two trailing "_" fields to recover the contig ID.
                    contig_id = "_".join(seq_id.split("_")[0:-2])  # nmdc:wfmgan-12-gbysvd76.1_0000001
                    if contig_id not in contig_ids:
                        print(f"{contig_id} is not in ${contig_file_name}.", file=sys.stderr)
                        sys.exit(1)
        CODE
    >>>

    output{
        # Pass-through of the input, so a consumer of this output runs only after the check succeeds.
        File contig = contig_file
    }
    runtime {
        memory: "1 GiB"
        cpu: 1
        docker: container
    }
}

task package{
String proj
Expand All @@ -326,9 +350,8 @@ task package{
File ko_file
File pfam_file
File tigrfam_file
File cath_funfam_file
File smart_file
File supfam_file
File crispr_file
File gene_phylogeny_file
File product_names_file
String container

Expand All @@ -337,7 +360,7 @@ task package{
create_tarfiles.py ${prefix} \
${json_stats} ${gff_file} ${proteins_file} ${cog_file} \
${ec_file} ${ko_file} ${pfam_file} ${tigrfam_file} \
${cath_funfam_file} ${smart_file} ${supfam_file} \
${crispr_file} ${gene_phylogeny_file} \
${product_names_file} \
${sep=" " bins}

Expand Down Expand Up @@ -372,22 +395,16 @@ task package{

task finish_mags {
String container
File contigs
File anno_gff
File sorted_bam
File mbin_sdb
File mbin_version
String proj
String prefix=sub(proj, ":", "_")
String start
File bacsum
File arcsum
File? short
File? low
File? unbinned
File? checkm
Array[File] hqmq_bin_fasta_files
Array[File] bin_fasta_files
Array[File] hqmq_bin_tarfiles
Array[File] lq_bin_tarfiles
File stats_json
Expand Down
4 changes: 2 additions & 2 deletions data/workflow/WDL/metaG/mbin_nmdc_output.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ workflow mbin_nmdc_output {
}

call make_output {
input: container="microbiomedata/nmdc_mbin_vis:0.4.0",
input: container="microbiomedata/nmdc_mbin_vis:0.7.0",
activity_json=generate_objects.activity_json,
object_json=generate_objects.data_object_json,
short=short,
Expand All @@ -60,7 +60,7 @@ workflow mbin_nmdc_output {

task pdf_to_png {
String? outdir
String container = "microbiomedata/nmdc_mbin_vis:0.4.0"
String container = "microbiomedata/nmdc_mbin_vis:0.7.0"
Array[File] pdf_files

command<<<
Expand Down
7 changes: 2 additions & 5 deletions data/workflow/templates/metaMAGs_inputs.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@
"main_workflow.<WORKFLOW>_ko_file":<KO_FILE>,
"main_workflow.<WORKFLOW>_pfam_file":<PFAM_FILE>,
"main_workflow.<WORKFLOW>_tigrfam_file":<TIGRFAM_FILE>,
"main_workflow.<WORKFLOW>_cath_funfam_file":<CATH_FUNFAM_FILE>,
"main_workflow.<WORKFLOW>_smart_file":<SMART_FILE>,
"main_workflow.<WORKFLOW>_supfam_file":<SUPFAM_FILE>,
"main_workflow.<WORKFLOW>_crispr_file":<CRISPR_FILE>,
"main_workflow.<WORKFLOW>_product_names_file":<PRODUCT_NAMES_FILE>,
"main_workflow.<WORKFLOW>_gene_phylogeny_file":<GENE_PHYLOGENY_FILE>,
"main_workflow.<WORKFLOW>_lineage_file":<LINEAGE_FILE>,
"main_workflow.<WORKFLOW>_domain_file":<DOMAIN_FILE>
"main_workflow.<WORKFLOW>_lineage_file":<LINEAGE_FILE>,
22 changes: 8 additions & 14 deletions data/workflow/templates/metaMAGs_wdl.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,24 @@
String <WORKFLOW>_ko_file
String <WORKFLOW>_pfam_file
String <WORKFLOW>_tigrfam_file
String <WORKFLOW>_cath_funfam_file
String <WORKFLOW>_smart_file
String <WORKFLOW>_supfam_file
String <WORKFLOW>_crispr_file
String <WORKFLOW>_product_names_file
String <WORKFLOW>_gene_phylogeny_file
String <WORKFLOW>_lineage_file
File? <WORKFLOW>_map_file
String? <WORKFLOW>_domain_file
Int <WORKFLOW>_cpu=16
Int <WORKFLOW>_threads=1
Int <WORKFLOW>_pthreads=1
String <WORKFLOW>_gtdbtk_db="/refdata/GTDBTK_DB/gtdbtk_release207_v2"
String <WORKFLOW>_checkm_db="/refdata/CheckM_DB/checkm_data_2015_01_16"
String <WORKFLOW>_eukcc2_db="/refdata/eukcc2_db_ver_1.2"
String metaMAGsVis_container = "microbiomedata/nmdc_mbin_vis:0.7.0"


call <WORKFLOW>.nmdc_mags as <ALIAS> {
input: proj=<WORKFLOW>_proj_name, contig_file=<WORKFLOW>_contig_file,
sam_file=<WORKFLOW>_sam_file, gff_file=<WORKFLOW>_gff_file, proteins_file=<WORKFLOW>_proteins_file,
cog_file=<WORKFLOW>_cog_file,ec_file=<WORKFLOW>_ec_file,ko_file=<WORKFLOW>_ko_file,pfam_file=<WORKFLOW>_pfam_file,
tigrfam_file=<WORKFLOW>_tigrfam_file,cath_funfam_file=<WORKFLOW>_cath_funfam_file,smart_file=<WORKFLOW>_smart_file,
supfam_file=<WORKFLOW>_supfam_file,product_names_file=<WORKFLOW>_product_names_file,
tigrfam_file=<WORKFLOW>_tigrfam_file,crispr_file=<WORKFLOW>_crispr_file,product_names_file=<WORKFLOW>_product_names_file,
gene_phylogeny_file=<WORKFLOW>_gene_phylogeny_file,lineage_file=<WORKFLOW>_lineage_file,
cpu=<WORKFLOW>_cpu, threads=<WORKFLOW>_threads,pthreads=<WORKFLOW>_pthreads,
gtdbtk_db=<WORKFLOW>_gtdbtk_db, checkm_db=<WORKFLOW>_checkm_db, scratch_dir=<WORKFLOW>_outdir,
Expand All @@ -41,7 +36,6 @@
call mbin_nmdc_output.pdf_to_png as metaMAGs_vis {
input:
outdir = <WORKFLOW>_outdir,
container = metaMAGsVis_container,
pdf_files = [<ALIAS>.barplot,<ALIAS>.heatmap]
}

Expand All @@ -50,14 +44,14 @@
File final_lq_bins_zip = <ALIAS>.final_lq_bins_zip
File final_gtdbtk_bac_summary = <ALIAS>.final_gtdbtk_bac_summary
File final_gtdbtk_ar_summary = <ALIAS>.final_gtdbtk_ar_summary
File short = <ALIAS>.final_short
File low = <ALIAS>.final_lowDepth_fa
File short = <ALIAS>.short
File low = <ALIAS>.low
File final_unbinned_fa = <ALIAS>.final_unbinned_fa
File final_checkm = <ALIAS>.final_checkm
File mags_version = <ALIAS>.final_version
File mags_version = <ALIAS>.mags_version
File final_stats_json = <ALIAS>.final_stats_json
File barplot = <ALIAS>.final_barplot
File heatmap = <ALIAS>.final_heatmap
File kronaplot = <ALIAS>.final_kronaplot
File barplot = <ALIAS>.barplot
File heatmap = <ALIAS>.heatmap
File kronaplot = <ALIAS>.kronaplot
}

17 changes: 3 additions & 14 deletions data/workflow/templates/metagenome_pipeline_wdl.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -202,22 +202,17 @@ workflow main_workflow {
File? metaMAGs_ko_file = metaAnnotation_call.ko_tsv
File? metaMAGs_pfam_file = metaAnnotation_call.pfam_gff
File? metaMAGs_tigrfam_file = metaAnnotation_call.tigrfam_gff
File? metaMAGs_cath_funfam_file = metaAnnotation_call.cath_funfam_gff
File? metaMAGs_smart_file = metaAnnotation_call.smart_gff
File? metaMAGs_supfam_file = metaAnnotation_call.supfam_gff
File? metaMAGs_crispr_file = metaAnnotation_call.crt_crisprs
File? metaMAGs_product_names_file = metaAnnotation_call.product_names_tsv
File? metaMAGs_gene_phylogeny_file = metaAnnotation_call.gene_phylogeny_tsv
File? metaMAGs_lineage_file = metaAnnotation_call.lineage_tsv
File? metaMAGs_map_file
File? metaMAGs_domain_file
Int metaMAGs_cpu=16
Int metaMAGs_threads=64
Int metaMAGs_pthreads=1
String metaMAGs_database="/refdata/GTDBTK_DB/gtdbtk_release207_v2"
String checkm_db="/refdata/CheckM_DB/checkm_data_2015_01_16"
String eukcc2_db="/refdata/EUKCC2_DB/eukcc2_db_ver_1.2"
String metaMAGsVis_container = "microbiomedata/nmdc_mbin_vis:0.4.0"
String metaMAGs_container = "microbiomedata/nmdc_mbin@sha256:57930406fb5cc364bacfc904066519de6cdc2d0ceda9db0eebf2336df3ef5349"


call metaMAGs.nmdc_mags as metaMAGs_call {
Expand All @@ -232,28 +227,22 @@ workflow main_workflow {
ko_file=metaMAGs_ko_file,
pfam_file=metaMAGs_pfam_file,
tigrfam_file=metaMAGs_tigrfam_file,
cath_funfam_file=metaMAGs_cath_funfam_file,
smart_file=metaMAGs_smart_file,
supfam_file=metaMAGs_supfam_file,
crispr_file=metaMAGs_crispr_file,
product_names_file=metaMAGs_product_names_file,
gene_phylogeny_file=metaMAGs_gene_phylogeny_file,
lineage_file=metaMAGs_lineage_file,
map_file=metaMAGs_map_file,
domain_file=metaMAGs_domain_file,
cpu=metaMAGs_cpu,
threads=metaMAGs_threads,
pthreads=metaMAGs_pthreads,
gtdbtk_db=metaMAGs_database,
checkm_db=checkm_db,
eukcc2_db=eukcc2_db,
scratch_dir=metaMAGs_outdir,
package_container=metaMAGsVis_container,
container=metaMAGs_container
scratch_dir=metaMAGs_outdir
}
call metaMAGsOutput.pdf_to_png as metaMAGs_vis {
input:
outdir = metaMAGs_outdir,
container = metaMAGsVis_container,
pdf_files = [metaMAGs_call.barplot,metaMAGs_call.heatmap]
}

Expand Down
Loading