Skip to content

Commit

Permalink
Two new workflows:
Browse files Browse the repository at this point in the history
    handle (ref-indepedent, alignment) a single piece of on-instrument demultiplexed CCS bam
  • Loading branch information
SHuang-Broad committed Dec 28, 2023
1 parent 7046e51 commit be67190
Show file tree
Hide file tree
Showing 3 changed files with 356 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ workflows:
- name: PBMASIsoSeqDemultiplex
subclass: wdl
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/PBMASIsoSeqDemultiplex.wdl
- name: ProcessOnInstrumentDemuxedChunkRefFree
subclass: wdl
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl
- name: ProcessOnInstrumentDemuxedChunk
subclass: wdl
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl

###################################################
# TechAgnostic - *mics data processing
Expand Down
200 changes: 200 additions & 0 deletions wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
version 1.0

import "../../../tasks/Utility/GeneralUtils.wdl" as GU

import "../../../tasks/Alignment/AlignHiFiUBAM.wdl" as ALN

import "../../TechAgnostic/Utility/AlignedBamQCandMetrics.wdl" as QCMetrics

import "../../../tasks/Utility/Finalize.wdl" as FF
import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE

workflow ProcessOnInstrumentDemuxedChunk {

meta {
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform alignment and QC check, and collect a few metrics."
}

parameter_meta {

#########
# inputs
readgroup_id:
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>; no whitespaces allowed"

bam_SM_field:
"value to place in the SM field of the resulting BAM header's @RG line"

platform:
"PacBio platform used for generating the data; accepted value: [Sequel, Revio]"

bam_descriptor:
"a one-word description of the purpose of the BAM (e.g. 'rg_post_aln'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)"

gcs_out_root_dir:
"output files will be copied over there"

cov_bed:
"An optional BED file on which coverage will be collected (a mean value for each interval)"
cov_bed_descriptor:
"A short description of the BED provided for targeted coverage estimation; will be used in naming output files."

fingerprint_store:
"A GCS 'folder' holding fingerprint VCF files"
sample_id_at_store:
"The ID of the sample supposedly this BAM belongs to; note that the fingerprint VCF is assumed to be located at {fingerprint_store}/{sample_id_at_store}*.vcf(.gz?)"

vbid2_config_json:
"A config json to for running the VBID2 contamination estimation sub-workflow; if provided, will trigger the VBID2 sub-workflow for cross-(human)individual contamination estimation."

expected_sex_type:
"If provided, triggers sex concordance check. Accepted value: [M, F, NA, na]"

check_postaln_methyl_tags:
"If true, will run a sub-workflow to collect methylation tags information."

#########
# outputs
wgs_cov:
"whole genome mean coverage"

nanoplot_summ:
"Summary on alignment metrics provided by Nanoplot (todo: study the value of this output)"

sam_flag_stats:
"SAM flag stats"

contamination_est:
"cross-(human)individual contamination estimation by VerifyBAMID2"

inferred_sex_info:
"Inferred sex concordance information if expected sex type is provided"

methyl_tag_simple_stats:
"Simple stats on the reads with & without SAM methylation tags (MM/ML)."

aBAM_metrics_files_out:
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks"
}

input {
File uBAM
File? uPBI

String readgroup_id

String bam_SM_field

String platform

File ref_map_file

String gcs_out_root_dir

String disk_type = "SSD"

# args for optional QC subworkflows
String bam_descriptor
String tech

File? cov_bed
String? cov_bed_descriptor

String? fingerprint_store
String? sample_id_at_store

File? vbid2_config_json
String? expected_sex_type
Boolean check_postaln_methyl_tags = true
}

output {
String last_processing_date = today.yyyy_mm_dd

File aligned_bam = FinalizeAlignedBam.gcs_path
File aligned_bai = FinalizeAlignedBai.gcs_path
File aligned_pbi = FinalizeAlignedPbi.gcs_path

String movie = AlignHiFiUBAM.movie

# metrics block (caveat: always has to keep an eye on the QC subworkflow about outputs)
Float wgs_cov = QCandMetrics.wgs_cov
Map[String, Float] nanoplot_summ = QCandMetrics.nanoplot_summ
Map[String, Float] sam_flag_stats = QCandMetrics.sam_flag_stats

# fingerprint
Map[String, String]? fingerprint_check = QCandMetrics.fingerprint_check

# contam
Float? contamination_est = QCandMetrics.contamination_est

# sex concordance
Map[String, String]? inferred_sex_info = QCandMetrics.inferred_sex_info

# methyl
Map[String, String]? methyl_tag_simple_stats = QCandMetrics.methyl_tag_simple_stats

# file-based QC/metrics outputs all packed into a finalization map
Map[String, String] aBAM_metrics_files_out = QCandMetrics.aBAM_metrics_files_out
}

###################################################################################
# prep work
# where to store final results
String workflow_name = "ProcessOnInstrumentDemuxedChunk"
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name

# String bc_specific_out = outdir + '/' + readgroup_id
String bc_specific_aln_out = outdir + '/alignments/' + readgroup_id
String bc_specific_metrics_out = outdir + "/metrics/" + readgroup_id

###################################################################################
# align
call ALN.AlignHiFiUBAM as AlignHiFiUBAM { input:
uBAM = uBAM,
uPBI = uPBI,
bam_sample_name = bam_SM_field,
ref_map_file = ref_map_file,
application = 'HIFI'
}

File aBAM = AlignHiFiUBAM.aligned_bam
File aBAI = AlignHiFiUBAM.aligned_bai
File aPBI = AlignHiFiUBAM.aligned_pbi

# save
call FF.FinalizeToFile as FinalizeAlignedBam { input: outdir = bc_specific_aln_out, file = aBAM, name = readgroup_id + '.bam' }
call FF.FinalizeToFile as FinalizeAlignedBai { input: outdir = bc_specific_aln_out, file = aBAI, name = readgroup_id + '.bai' }
call FF.FinalizeToFile as FinalizeAlignedPbi { input: outdir = bc_specific_aln_out, file = aPBI, name = readgroup_id + '.pbi' }

###################################################################################
# QC
call QCMetrics.Work as QCandMetrics { input:
bam = aBAM,
bai = aBAI,

bam_descriptor = bam_descriptor,
tech = platform,

cov_bed = cov_bed,
cov_bed_descriptor = cov_bed_descriptor,

fingerprint_store = fingerprint_store,
sample_id_at_store = sample_id_at_store,

vbid2_config_json = vbid2_config_json,
expected_sex_type = expected_sex_type,
check_postaln_methyl_tags = check_postaln_methyl_tags,

ref_map_file = ref_map_file,
disk_type = disk_type,

output_prefix = readgroup_id, # String output_prefix = bam_sample_name + "." + flowcell
gcs_out_root_dir = bc_specific_metrics_out,
}

###################################################################################
call GU.GetTodayDate as today {}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
version 1.0


import "../../../tasks/Utility/BAMutils.wdl" as BU
import "../../../tasks/Utility/GeneralUtils.wdl" as GU

import "../../../tasks/Visualization/NanoPlot.wdl" as QC0
import "../../TechAgnostic/Utility/CountTheBeans.wdl" as QC1
import "../../TechAgnostic/Utility/DystPeaker.wdl" as QC2
import "../../TechAgnostic/Utility/FASTQstats.wdl" as QC3

import "../../../tasks/Utility/Finalize.wdl" as FF
import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE

workflow ProcessOnInstrumentDemuxedChunkRefFree {

meta {
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform ref-independent prep work, and collect a few metrics"
}

parameter_meta {
readgroup_id:
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>"

short_reads_threshold:
"a length threshold below which reads are classified as short"

bam_descriptor:
"a one-word description of the purpose of the BAM (e.g. 'a_particular_seq_center'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)"

hifi_stats:
"A few metrics output by Nanoplot"
hifi_fq_stats:
"A few metrics output by seqkit stats"

read_len_summaries:
"A few metrics summarizing the read length distribution"
read_len_peaks:
"Peaks of the read length distribution (heruistic)"
read_len_deciles:
"Deciles of the read length distribution"

methyl_tag_simple_stats:
"Simple stats on the reads with & without SAM methylation tags (MM/ML)."

uBAM_metrics_files_out:
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks"
}

input {
File uBAM
String readgroup_id

String bam_descriptor
Int short_reads_threshold

String disk_type = "SSD"

String gcs_out_root_dir
}

output {
String movie = movie_name

File hifi_fq = FinalizeFQ.gcs_path

String last_processing_date = today.yyyy_mm_dd

# todo merge these two together
Map[String, Float] hifi_stats = NanoPlotFromUBam.stats_map
Map[String, Float] hifi_fq_stats = FASTQstats.stats

Map[String, String] read_len_summaries = DystPeaker.read_len_summaries
Array[Int] read_len_peaks = DystPeaker.read_len_peaks
Array[Int] read_len_deciles = DystPeaker.read_len_deciles

# methylation call rate stats
Map[String, String] methyl_tag_simple_stats = NoMissingBeans.methyl_tag_simple_stats

# file-based outputs all packed into a finalization map
Map[String, String] uBAM_metrics_files_out = FF.result
}

###################################################################################
# prep work
# where to store final results
String workflow_name = "ProcessOnInstrumentDemuxedChunkRefFree"
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name

String outdir_ref_free = outdir + '/RefFree' # ideally, this isn't necessary, but it's a relic we'd not touch right now (2023-12-23)
String bc_specific_out = outdir_ref_free + '/' + readgroup_id
String bc_specific_metrics_out = bc_specific_out + "/metrics"

###################################################################################
call BU.GetReadGroupInfo as RG {input: bam = uBAM, keys = ['PU']}
String movie_name = RG.read_group_info['PU']

###################################################################################
# convert to FASTQ (most for assembly)
call BU.BamToFastq {input: bam = uBAM, prefix = "does_not_matter"}
call FF.FinalizeToFile as FinalizeFQ { input:
outdir = bc_specific_out,
file = BamToFastq.reads_fq,
name = readgroup_id + '.hifi.fq.gz'
}

###################################################################################
# more QCs and metrics
##########
call QC0.NanoPlotFromUBam {input: uBAM = uBAM}
FinalizationManifestLine a = object
{files_to_save: flatten([[NanoPlotFromUBam.stats], NanoPlotFromUBam.plots]),
is_singleton_file: false,
destination: bc_specific_metrics_out + "/nanoplot",
output_attribute_name: "nanoplot"}
##########
call QC1.CountTheBeans as NoMissingBeans { input:
bam=uBAM,
bam_descriptor=bam_descriptor,
gcs_out_root_dir=bc_specific_metrics_out,
use_local_ssd=disk_type=='LOCAL'
}
Map[String, String] methyl_out = {"missing_methyl_tag_reads":
NoMissingBeans.methyl_tag_simple_stats['files_holding_reads_without_tags']}
##########
call QC2.DystPeaker { input:
input_file=uBAM, input_is_bam=true,
id=readgroup_id, short_reads_threshold=short_reads_threshold,
gcs_out_root_dir=bc_specific_metrics_out
}
Map[String, String] read_len_out = {"read_len_hist": DystPeaker.read_len_hist,
"raw_rl_file": DystPeaker.read_len_summaries['raw_rl_file']}

##########
call QC3.FASTQstats { input:
reads=BamToFastq.reads_fq,
file_type='FASTQ'
}

###################################################################################
call SAVE.SaveFilestoDestination as FF { input:
instructions = select_all([a]),
already_finalized = select_all([methyl_out,
read_len_out])
}

call GU.GetTodayDate as today {}
}

0 comments on commit be67190

Please sign in to comment.