bring Funcotator changes to M2 NIO WDL

broadinstitute · Mar 1, 2019 · 342c15f · 342c15f
1 parent 80ab76b
commit 342c15f
Showing 1 changed file with 92 additions and 51 deletions.
diff --git a/scripts/mutect2_wdl/mutect2_nio.wdl b/scripts/mutect2_wdl/mutect2_nio.wdl
@@ -56,12 +56,17 @@
 ##
 ## Funcotator parameters (see Funcotator help for more details).
 ## funco_reference_version: "hg19" for hg19 or b37.  "hg38" for hg38.  Default: "hg19"
-## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process.
+## funco_output_format: "MAF" to produce a MAF file, "VCF" to produce a VCF file.  Default: "MAF"
+## funco_compress: (Only valid if funco_output_format == "VCF")  If true, will compress the output of Funcotator.  If false, produces an uncompressed output file.  Default: false
+## funco_use_gnomad_AF: If true, will include gnomAD allele frequency annotations in output by connecting to the internet to query gnomAD (this impacts performance).  If false, will not annotate with gnomAD.  Default: false
 ## funco_transcript_selection_mode: How to select transcripts in Funcotator.  ALL, CANONICAL, or BEST_EFFECT
+## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process.
 ## funco_data_sources_tar_gz:  Funcotator datasources tar gz file.  Bucket location is recommended when running on the cloud.
 ## funco_annotation_defaults:  Default values for annotations, when values are unspecified.  Specified as  <ANNOTATION>:<VALUE>.  For example:  "Center:Broad"
 ## funco_annotation_overrides:  Values that override annotations, even when values are already specified.  Specified as  <ANNOTATION>:<VALUE>.  For example:  "Center:Broad"
 ## funcotator_excluded_fields:  Annotations that should not appear in the output (VCF or MAF).  Specified as  <ANNOTATION>.  For example:  "ClinVar_ALLELEID"
+## funco_filter_funcotations: If true, will only annotate variants that have passed filtering (. or PASS value in the FILTER column).  If false, will annotate all variants in the input file.  Default: true
+## funcotator_extra_args: Any additional arguments to pass to Funcotator.  Default: ""
 ##
 ## Outputs :
 ## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam
@@ -119,22 +124,29 @@ workflow Mutect2 {
     File? default_config_file
     String? oncotator_extra_args
 
-    # funcotator inputs
+    # Funcotator inputs
     Boolean? run_funcotator
     Boolean run_funcotator_or_default = select_first([run_funcotator, false])
     String? funco_reference_version
+    String? funco_output_format
+    Boolean? funco_compress
+    Boolean? funco_use_gnomad_AF
     File? funco_data_sources_tar_gz
     String? funco_transcript_selection_mode
     File? funco_transcript_selection_list
     Array[String]? funco_annotation_defaults
     Array[String]? funco_annotation_overrides
     Array[String]? funcotator_excluded_fields
+    Boolean? funco_filter_funcotations
     String? funcotator_extra_args
 
-    File? gatk_override
+    Boolean run_funcotator_or_default = select_first([run_funcotator, false])
+    String funco_default_output_format = "MAF"
+
 
     # runtime
     String gatk_docker
+    File? gatk_override
     String basic_bash_docker = "ubuntu:16.04"
     String? oncotator_docker
     String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.9.0"])
@@ -438,28 +450,33 @@ workflow Mutect2 {
     if (run_funcotator_or_default) {
         File funcotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
         File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf_index, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])
-        call FuncotateMaf {
+        call Funcotate {
             input:
+                ref_fasta = ref_fasta,
                 input_vcf = funcotate_vcf_input,
                 input_vcf_idx = funcotate_vcf_input_index,
-                ref_fasta = ref_fasta,
                 reference_version = select_first([funco_reference_version, "hg19"]),
+                output_file_base_name = basename(funcotate_vcf_input, ".vcf") + ".annotated",
+                output_format = if defined(funco_output_format) then "" + funco_output_format else funco_default_output_format,
+                compress = if defined(funco_compress) then funco_compress else false,
+                use_gnomad = if defined(funco_use_gnomad_AF) then funco_use_gnomad_AF else false,
                 data_sources_tar_gz = funco_data_sources_tar_gz,
                 case_id = M2.tumor_sample[0],
                 control_id = M2.normal_sample[0],
+                sequencing_center = sequencing_center,
+                sequence_source = sequence_source,
                 transcript_selection_mode = funco_transcript_selection_mode,
                 transcript_selection_list = funco_transcript_selection_list,
                 annotation_defaults = funco_annotation_defaults,
                 annotation_overrides = funco_annotation_overrides,
+                funcotator_excluded_fields = funcotator_excluded_fields,
+                filter_funcotations = filter_funcotations_or_default,
+                extra_args = funcotator_extra_args,
                 gatk_docker = gatk_docker,
                 gatk_override = gatk_override,
-                filter_funcotations = filter_funcotations_or_default,
-                funcotator_excluded_fields = funcotator_excluded_fields,
-                sequencing_center = sequencing_center,
-                sequence_source = sequence_source,
-                disk_space_gb = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + funco_tar_size + disk_pad,
+                preemptible_attempts = preemptible_attempts,
                 max_retries = max_retries,
-                extra_args = funcotator_extra_args
+                disk_space_gb = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad
         }
     }
 
@@ -469,7 +486,8 @@ workflow Mutect2 {
         File? contamination_table = CalculateContamination.contamination_table
 
         File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf
-        File? funcotated_maf = FuncotateMaf.funcotated_output
+        File? funcotated_file = Funcotate.funcotated_output_file
+        File? funcotated_file_index = Funcotate.funcotated_output_file_index
         File? preadapter_detail_metrics = CollectSequencingArtifactMetrics.pre_adapter_metrics
         File? bamout = MergeBamOuts.merged_bam_out
         File? bamout_index = MergeBamOuts.merged_bam_out_index
@@ -1228,40 +1246,52 @@ task SumFloats {
     }
 }
 
-task FuncotateMaf {
-     # inputs
+task Funcotate {
+     # ==============
+     # Inputs
      String ref_fasta
      String input_vcf
      String input_vcf_idx
      String reference_version
-     String output_format = "MAF"
+     String output_file_base_name
+     String output_format
+     Boolean compress
+     Boolean use_gnomad
+     # This should be updated when a new version of the data sources is released
+     # TODO: Make this dynamically chosen in the command.
+     File? data_sources_tar_gz = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz"
+     String? control_id
+     String? case_id
      String? sequencing_center
      String? sequence_source
-     String case_id
-     String? control_id
-
-     File? data_sources_tar_gz
      String? transcript_selection_mode
      File? transcript_selection_list
      Array[String]? annotation_defaults
      Array[String]? annotation_overrides
      Array[String]? funcotator_excluded_fields
-     Boolean filter_funcotations
+     Boolean? filter_funcotations
      File? interval_list
 
      String? extra_args
 
      # ==============
      # Process input args:
+     String output_maf = output_file_base_name + ".maf"
+     String output_maf_index = output_maf + ".idx"
+     String output_vcf = output_file_base_name + if compress then ".vcf.gz" else ".vcf"
+     String output_vcf_index = output_vcf +  if compress then ".tbi" else ".idx"
+     String output_file = if output_format == "MAF" then output_maf else output_vcf
+     String output_file_index = if output_format == "MAF" then output_maf_index else output_vcf_index
+     String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else ""
      String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else ""
      String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else ""
-     String filter_funcotations_args = if (filter_funcotations) then " --remove-filtered-variants " else ""
+     String filter_funcotations_args = if defined(filter_funcotations) && (filter_funcotations) then " --remove-filtered-variants " else ""
      String excluded_fields_args = if defined(funcotator_excluded_fields) then " --exclude-field " else ""
-     String final_output_filename = basename(input_vcf, ".vcf") + ".maf.annotated"
-     # ==============
-
-     # runtime
+     String interval_list_arg = if defined(interval_list) then " -L " else ""
+     String extra_args_arg = select_first([extra_args, ""])
 
+     # ==============
+     # Runtime options:
      String gatk_docker
      File? gatk_override
      Int? mem
@@ -1272,56 +1302,66 @@ task FuncotateMaf {
 
      Boolean use_ssd = false
 
-     # This should be updated when a new version of the data sources is released
-     String default_datasources_version = "funcotator_dataSources.v1.4.20180615"
-
      # You may have to change the following two parameter values depending on the task requirements
      Int default_ram_mb = 3000
-     # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb).
+     # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb).  Please see [TODO: Link from Jose] for examples.
      Int default_disk_space_gb = 100
 
      # Mem is in units of GB but our command and memory runtime values are in MB
      Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb
      Int command_mem = machine_mem - 1000
 
+     String dollar = "$"
+
      command <<<
          set -e
          export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
 
-         DATA_SOURCES_TAR_GZ=${data_sources_tar_gz}
-         if [[ ! -e $DATA_SOURCES_TAR_GZ ]] ; then
-             # We have to download the data sources:
-             echo "Data sources gzip does not exist: $DATA_SOURCES_TAR_GZ"
-             echo "Downloading default data sources..."
-             wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/${default_datasources_version}.tar.gz
-             tar -zxf ${default_datasources_version}.tar.gz
-             DATA_SOURCES_FOLDER=${default_datasources_version}
-         else
-             # Extract the tar.gz:
-             mkdir datasources_dir
-             tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1
-             DATA_SOURCES_FOLDER="$PWD/datasources_dir"
+         # Extract our data sources:
+         echo "Extracting data sources zip file..."
+         mkdir datasources_dir
+         tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1
+         DATA_SOURCES_FOLDER="$PWD/datasources_dir"
+
+         # Handle gnomAD:
+         if ${use_gnomad} ; then
+             echo "Enabling gnomAD..."
+             for potential_gnomad_gz in gnomAD_exome.tar.gz gnomAD_genome.tar.gz ; do
+                 if [[ -f ${dollar}{DATA_SOURCES_FOLDER}/${dollar}{potential_gnomad_gz} ]] ; then
+                     cd ${dollar}{DATA_SOURCES_FOLDER}
+                     tar -zvxf ${dollar}{potential_gnomad_gz}
+                     cd -
+                 else
+                     echo "ERROR: Cannot find gnomAD folder: ${dollar}{potential_gnomad_gz}" 1>&2
+                     false
+                 fi
+             done
          fi
 
+         # Run Funcotator:
          gatk --java-options "-Xmx${command_mem}m" Funcotator \
              --data-sources-path $DATA_SOURCES_FOLDER \
              --ref-version ${reference_version} \
              --output-file-format ${output_format} \
              -R ${ref_fasta} \
              -V ${input_vcf} \
-             -O ${final_output_filename} \
-             ${"-L " + interval_list} \
+             -O ${output_file} \
+             ${interval_list_arg} ${default="" interval_list} \
+             --annotation-default normal_barcode:${default="Unknown" control_id} \
+             --annotation-default tumor_barcode:${default="Unknown" case_id} \
+             --annotation-default Center:${default="Unknown" sequencing_center} \
+             --annotation-default source:${default="Unknown" sequence_source} \
              ${"--transcript-selection-mode " + transcript_selection_mode} \
-             ${"--transcript-list " + transcript_selection_list} \
-            --annotation-default normal_barcode:${control_id} \
-            --annotation-default tumor_barcode:${case_id} \
-            --annotation-default Center:${default="Unknown" sequencing_center} \
-            --annotation-default source:${default="Unknown" sequence_source} \
+             ${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \
              ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \
              ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \
              ${excluded_fields_args}${default="" sep=" --exclude-field " funcotator_excluded_fields} \
              ${filter_funcotations_args} \
-             ${extra_args}
+             ${extra_args_arg}
+         # Make sure we have a placeholder index for MAF files so this workflow doesn't fail:
+         if [[ "${output_format}" == "MAF" ]] ; then
+            touch ${output_maf_index}
+         fi
      >>>
 
      runtime {
@@ -1335,6 +1375,7 @@ task FuncotateMaf {
      }
 
      output {
-         File funcotated_output = "${final_output_filename}"
+         File funcotated_output_file = "${output_file}"
+         File funcotated_output_file_index = "${output_file_index}"
      }
  }