-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
handle (ref-indepedent, alignment) a single piece of on-instrument demultiplexed CCS bam
- Loading branch information
1 parent
7046e51
commit be67190
Showing
3 changed files
with
356 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
200 changes: 200 additions & 0 deletions
200
wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
version 1.0 | ||
|
||
import "../../../tasks/Utility/GeneralUtils.wdl" as GU | ||
|
||
import "../../../tasks/Alignment/AlignHiFiUBAM.wdl" as ALN | ||
|
||
import "../../TechAgnostic/Utility/AlignedBamQCandMetrics.wdl" as QCMetrics | ||
|
||
import "../../../tasks/Utility/Finalize.wdl" as FF | ||
import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE | ||
|
||
workflow ProcessOnInstrumentDemuxedChunk { | ||
|
||
meta { | ||
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform alignment and QC check, and collect a few metrics." | ||
} | ||
|
||
parameter_meta { | ||
|
||
######### | ||
# inputs | ||
readgroup_id: | ||
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>; no whitespaces allowed" | ||
|
||
bam_SM_field: | ||
"value to place in the SM field of the resulting BAM header's @RG line" | ||
|
||
platform: | ||
"PacBio platform used for generating the data; accepted value: [Sequel, Revio]" | ||
|
||
bam_descriptor: | ||
"a one-word description of the purpose of the BAM (e.g. 'rg_post_aln'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)" | ||
|
||
gcs_out_root_dir: | ||
"output files will be copied over there" | ||
|
||
cov_bed: | ||
"An optional BED file on which coverage will be collected (a mean value for each interval)" | ||
cov_bed_descriptor: | ||
"A short description of the BED provided for targeted coverage estimation; will be used in naming output files." | ||
|
||
fingerprint_store: | ||
"A GCS 'folder' holding fingerprint VCF files" | ||
sample_id_at_store: | ||
"The ID of the sample supposedly this BAM belongs to; note that the fingerprint VCF is assumed to be located at {fingerprint_store}/{sample_id_at_store}*.vcf(.gz?)" | ||
|
||
vbid2_config_json: | ||
"A config json to for running the VBID2 contamination estimation sub-workflow; if provided, will trigger the VBID2 sub-workflow for cross-(human)individual contamination estimation." | ||
|
||
expected_sex_type: | ||
"If provided, triggers sex concordance check. Accepted value: [M, F, NA, na]" | ||
|
||
check_postaln_methyl_tags: | ||
"If true, will run a sub-workflow to collect methylation tags information." | ||
|
||
######### | ||
# outputs | ||
wgs_cov: | ||
"whole genome mean coverage" | ||
|
||
nanoplot_summ: | ||
"Summary on alignment metrics provided by Nanoplot (todo: study the value of this output)" | ||
|
||
sam_flag_stats: | ||
"SAM flag stats" | ||
|
||
contamination_est: | ||
"cross-(human)individual contamination estimation by VerifyBAMID2" | ||
|
||
inferred_sex_info: | ||
"Inferred sex concordance information if expected sex type is provided" | ||
|
||
methyl_tag_simple_stats: | ||
"Simple stats on the reads with & without SAM methylation tags (MM/ML)." | ||
|
||
aBAM_metrics_files_out: | ||
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks" | ||
} | ||
|
||
input { | ||
File uBAM | ||
File? uPBI | ||
|
||
String readgroup_id | ||
|
||
String bam_SM_field | ||
|
||
String platform | ||
|
||
File ref_map_file | ||
|
||
String gcs_out_root_dir | ||
|
||
String disk_type = "SSD" | ||
|
||
# args for optional QC subworkflows | ||
String bam_descriptor | ||
String tech | ||
|
||
File? cov_bed | ||
String? cov_bed_descriptor | ||
|
||
String? fingerprint_store | ||
String? sample_id_at_store | ||
|
||
File? vbid2_config_json | ||
String? expected_sex_type | ||
Boolean check_postaln_methyl_tags = true | ||
} | ||
|
||
output { | ||
String last_processing_date = today.yyyy_mm_dd | ||
|
||
File aligned_bam = FinalizeAlignedBam.gcs_path | ||
File aligned_bai = FinalizeAlignedBai.gcs_path | ||
File aligned_pbi = FinalizeAlignedPbi.gcs_path | ||
|
||
String movie = AlignHiFiUBAM.movie | ||
|
||
# metrics block (caveat: always has to keep an eye on the QC subworkflow about outputs) | ||
Float wgs_cov = QCandMetrics.wgs_cov | ||
Map[String, Float] nanoplot_summ = QCandMetrics.nanoplot_summ | ||
Map[String, Float] sam_flag_stats = QCandMetrics.sam_flag_stats | ||
|
||
# fingerprint | ||
Map[String, String]? fingerprint_check = QCandMetrics.fingerprint_check | ||
|
||
# contam | ||
Float? contamination_est = QCandMetrics.contamination_est | ||
|
||
# sex concordance | ||
Map[String, String]? inferred_sex_info = QCandMetrics.inferred_sex_info | ||
|
||
# methyl | ||
Map[String, String]? methyl_tag_simple_stats = QCandMetrics.methyl_tag_simple_stats | ||
|
||
# file-based QC/metrics outputs all packed into a finalization map | ||
Map[String, String] aBAM_metrics_files_out = QCandMetrics.aBAM_metrics_files_out | ||
} | ||
|
||
################################################################################### | ||
# prep work | ||
# where to store final results | ||
String workflow_name = "ProcessOnInstrumentDemuxedChunk" | ||
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name | ||
|
||
# String bc_specific_out = outdir + '/' + readgroup_id | ||
String bc_specific_aln_out = outdir + '/alignments/' + readgroup_id | ||
String bc_specific_metrics_out = outdir + "/metrics/" + readgroup_id | ||
|
||
################################################################################### | ||
# align | ||
call ALN.AlignHiFiUBAM as AlignHiFiUBAM { input: | ||
uBAM = uBAM, | ||
uPBI = uPBI, | ||
bam_sample_name = bam_SM_field, | ||
ref_map_file = ref_map_file, | ||
application = 'HIFI' | ||
} | ||
|
||
File aBAM = AlignHiFiUBAM.aligned_bam | ||
File aBAI = AlignHiFiUBAM.aligned_bai | ||
File aPBI = AlignHiFiUBAM.aligned_pbi | ||
|
||
# save | ||
call FF.FinalizeToFile as FinalizeAlignedBam { input: outdir = bc_specific_aln_out, file = aBAM, name = readgroup_id + '.bam' } | ||
call FF.FinalizeToFile as FinalizeAlignedBai { input: outdir = bc_specific_aln_out, file = aBAI, name = readgroup_id + '.bai' } | ||
call FF.FinalizeToFile as FinalizeAlignedPbi { input: outdir = bc_specific_aln_out, file = aPBI, name = readgroup_id + '.pbi' } | ||
|
||
################################################################################### | ||
# QC | ||
call QCMetrics.Work as QCandMetrics { input: | ||
bam = aBAM, | ||
bai = aBAI, | ||
|
||
bam_descriptor = bam_descriptor, | ||
tech = platform, | ||
|
||
cov_bed = cov_bed, | ||
cov_bed_descriptor = cov_bed_descriptor, | ||
|
||
fingerprint_store = fingerprint_store, | ||
sample_id_at_store = sample_id_at_store, | ||
|
||
vbid2_config_json = vbid2_config_json, | ||
expected_sex_type = expected_sex_type, | ||
check_postaln_methyl_tags = check_postaln_methyl_tags, | ||
|
||
ref_map_file = ref_map_file, | ||
disk_type = disk_type, | ||
|
||
output_prefix = readgroup_id, # String output_prefix = bam_sample_name + "." + flowcell | ||
gcs_out_root_dir = bc_specific_metrics_out, | ||
} | ||
|
||
################################################################################### | ||
call GU.GetTodayDate as today {} | ||
} |
150 changes: 150 additions & 0 deletions
150
wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
version 1.0 | ||
|
||
|
||
import "../../../tasks/Utility/BAMutils.wdl" as BU | ||
import "../../../tasks/Utility/GeneralUtils.wdl" as GU | ||
|
||
import "../../../tasks/Visualization/NanoPlot.wdl" as QC0 | ||
import "../../TechAgnostic/Utility/CountTheBeans.wdl" as QC1 | ||
import "../../TechAgnostic/Utility/DystPeaker.wdl" as QC2 | ||
import "../../TechAgnostic/Utility/FASTQstats.wdl" as QC3 | ||
|
||
import "../../../tasks/Utility/Finalize.wdl" as FF | ||
import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE | ||
|
||
workflow ProcessOnInstrumentDemuxedChunkRefFree { | ||
|
||
meta { | ||
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform ref-independent prep work, and collect a few metrics" | ||
} | ||
|
||
parameter_meta { | ||
readgroup_id: | ||
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>" | ||
|
||
short_reads_threshold: | ||
"a length threshold below which reads are classified as short" | ||
|
||
bam_descriptor: | ||
"a one-word description of the purpose of the BAM (e.g. 'a_particular_seq_center'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)" | ||
|
||
hifi_stats: | ||
"A few metrics output by Nanoplot" | ||
hifi_fq_stats: | ||
"A few metrics output by seqkit stats" | ||
|
||
read_len_summaries: | ||
"A few metrics summarizing the read length distribution" | ||
read_len_peaks: | ||
"Peaks of the read length distribution (heruistic)" | ||
read_len_deciles: | ||
"Deciles of the read length distribution" | ||
|
||
methyl_tag_simple_stats: | ||
"Simple stats on the reads with & without SAM methylation tags (MM/ML)." | ||
|
||
uBAM_metrics_files_out: | ||
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks" | ||
} | ||
|
||
input { | ||
File uBAM | ||
String readgroup_id | ||
|
||
String bam_descriptor | ||
Int short_reads_threshold | ||
|
||
String disk_type = "SSD" | ||
|
||
String gcs_out_root_dir | ||
} | ||
|
||
output { | ||
String movie = movie_name | ||
|
||
File hifi_fq = FinalizeFQ.gcs_path | ||
|
||
String last_processing_date = today.yyyy_mm_dd | ||
|
||
# todo merge these two together | ||
Map[String, Float] hifi_stats = NanoPlotFromUBam.stats_map | ||
Map[String, Float] hifi_fq_stats = FASTQstats.stats | ||
|
||
Map[String, String] read_len_summaries = DystPeaker.read_len_summaries | ||
Array[Int] read_len_peaks = DystPeaker.read_len_peaks | ||
Array[Int] read_len_deciles = DystPeaker.read_len_deciles | ||
|
||
# methylation call rate stats | ||
Map[String, String] methyl_tag_simple_stats = NoMissingBeans.methyl_tag_simple_stats | ||
|
||
# file-based outputs all packed into a finalization map | ||
Map[String, String] uBAM_metrics_files_out = FF.result | ||
} | ||
|
||
################################################################################### | ||
# prep work | ||
# where to store final results | ||
String workflow_name = "ProcessOnInstrumentDemuxedChunkRefFree" | ||
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name | ||
|
||
String outdir_ref_free = outdir + '/RefFree' # ideally, this isn't necessary, but it's a relic we'd not touch right now (2023-12-23) | ||
String bc_specific_out = outdir_ref_free + '/' + readgroup_id | ||
String bc_specific_metrics_out = bc_specific_out + "/metrics" | ||
|
||
################################################################################### | ||
call BU.GetReadGroupInfo as RG {input: bam = uBAM, keys = ['PU']} | ||
String movie_name = RG.read_group_info['PU'] | ||
|
||
################################################################################### | ||
# convert to FASTQ (most for assembly) | ||
call BU.BamToFastq {input: bam = uBAM, prefix = "does_not_matter"} | ||
call FF.FinalizeToFile as FinalizeFQ { input: | ||
outdir = bc_specific_out, | ||
file = BamToFastq.reads_fq, | ||
name = readgroup_id + '.hifi.fq.gz' | ||
} | ||
|
||
################################################################################### | ||
# more QCs and metrics | ||
########## | ||
call QC0.NanoPlotFromUBam {input: uBAM = uBAM} | ||
FinalizationManifestLine a = object | ||
{files_to_save: flatten([[NanoPlotFromUBam.stats], NanoPlotFromUBam.plots]), | ||
is_singleton_file: false, | ||
destination: bc_specific_metrics_out + "/nanoplot", | ||
output_attribute_name: "nanoplot"} | ||
########## | ||
call QC1.CountTheBeans as NoMissingBeans { input: | ||
bam=uBAM, | ||
bam_descriptor=bam_descriptor, | ||
gcs_out_root_dir=bc_specific_metrics_out, | ||
use_local_ssd=disk_type=='LOCAL' | ||
} | ||
Map[String, String] methyl_out = {"missing_methyl_tag_reads": | ||
NoMissingBeans.methyl_tag_simple_stats['files_holding_reads_without_tags']} | ||
########## | ||
call QC2.DystPeaker { input: | ||
input_file=uBAM, input_is_bam=true, | ||
id=readgroup_id, short_reads_threshold=short_reads_threshold, | ||
gcs_out_root_dir=bc_specific_metrics_out | ||
} | ||
Map[String, String] read_len_out = {"read_len_hist": DystPeaker.read_len_hist, | ||
"raw_rl_file": DystPeaker.read_len_summaries['raw_rl_file']} | ||
|
||
########## | ||
call QC3.FASTQstats { input: | ||
reads=BamToFastq.reads_fq, | ||
file_type='FASTQ' | ||
} | ||
|
||
################################################################################### | ||
call SAVE.SaveFilestoDestination as FF { input: | ||
instructions = select_all([a]), | ||
already_finalized = select_all([methyl_out, | ||
read_len_out]) | ||
} | ||
|
||
call GU.GetTodayDate as today {} | ||
} |