build(dockstore): tweak dockstore config, improve WDL docs

PharmGKB · Aug 27, 2024 · f66033b · f66033b
1 parent b46cae3
commit f66033b
Show file tree

Hide file tree

Showing 7 changed files with 169 additions and 1,250 deletions.
diff --git a/.dockstore.yml b/.dockstore.yml
@@ -0,0 +1,9 @@
+version: 1.2
+workflows:
+  - subclass: WDL
+    name: PharmCAT-Pipeline
+    primaryDescriptorPath: /pipeline/PharmCAT_Pipeline.wdl
+    readMePath: /pipeline/README.md
+    filters:
+      tags:
+        - /v\d+.*/
diff --git a/Makefile b/Makefile
@@ -39,7 +39,6 @@ updateData: clean
 	mv -f src/main/resources/org/pharmgkb/pharmcat/definition/alleles/pharmcat_positions.* .
 	cp -f pharmcat_positions.vcf src/test/resources/org/pharmgkb/pharmcat/reference.vcf
 	cp -f pharmcat_positions.vcf docs/examples/pharmcat.example.vcf
-	cp -f pharmcat_positions.vcf dockstore/pipeline/example.vcf
 	sed -e '/rs12769205/ s/0\/0/1\/1/g' pharmcat_positions.vcf | sed '/rs4244285/ s/0\/0/1\/1/g' | sed '/rs3758581/ s/0\/0/1\/1/g' | sed '/rs3745274/ s/0\/0/0\/1/g' | sed '/rs2279343/ s/0\/0/0\/1/g' > docs/examples/pharmcat.example2.vcf
 	@echo ""
 	@echo "Updating examples..."

diff --git a/dockstore.yml b/dockstore.yml
diff --git a/dockstore/pipeline/PharmCAT_Pipeline.wdl b/dockstore/pipeline/PharmCAT_Pipeline.wdl
@@ -1,127 +1,152 @@
 version 1.0
 
-# It is a single task that runs the PharmCAT pipeline on a VCF file
-# as a single task the workflow is just a wrapper for the task and get the same name
-
-# The output is an array of files that are the results of the pipeline save in the plataform where the workflow is running
-
-task pharmcat_pipeline {
-    input {
-        File vcf_file
-        String sample_ids = ""
-        File? sample_file
-        Boolean missing_to_ref = false
-        Boolean no_gvcf_check = false
-        Boolean retain_specific_regions = false
-        File? reference_regions
-        Boolean run_matcher = false
-        Boolean matcher_all_results = false
-        Boolean matcher_save_html = false
-        String research_mode = ""
-        Boolean run_phenotyper = false
-        Boolean run_reporter = false
-        String reporter_sources = ""
-        Boolean reporter_extended = false
-        Boolean reporter_save_json = false
-        String base_filename = ""
-        Boolean delete_intermediate_files = false
-        Int max_concurrent_processes = 1
-        String max_memory = "4G"
-    }
-
-    command <<<
-        set -x -e -o pipefail
-        mkdir -p data
-        cp ~{vcf_file} data/
-
-        pharmcat_pipeline data/$(basename ~{vcf_file}) \
-        ~{if sample_ids != "" then '-s ' + sample_ids else ''} \
-        ~{if defined(sample_file) then '-S ' + sample_file else ''} \
-        ~{if missing_to_ref then '-0' else ''} \
-        ~{if no_gvcf_check then '-G' else ''} \
-        ~{if retain_specific_regions then '-R' else ''} \
-        ~{if defined(reference_regions) then '-refRegion ' + reference_regions else ''} \
-        ~{if run_matcher then '-matcher' else ''} \
-        ~{if matcher_all_results then '-ma' else ''} \
-        ~{if matcher_save_html then '-matcherHtml' else ''} \
-        ~{if research_mode != "" then '-research ' + research_mode else ''} \
-        ~{if run_phenotyper then '-phenotyper' else ''} \
-        ~{if run_reporter then '-reporter' else ''} \
-        ~{if reporter_sources != "" then '-rs ' + reporter_sources else ''} \
-        ~{if reporter_extended then '-re' else ''} \
-        ~{if reporter_save_json then '-reporterJson' else ''} \
-        ~{if base_filename != "" then '-bf ' + base_filename else ''} \
-        ~{if delete_intermediate_files then '-del' else ''} \
-        -cp ~{max_concurrent_processes} -cm ~{max_memory}
-    >>>
-
-    output {
-        Array[File] results = glob("data/*")
-    }
-
-    runtime {
-        docker: "pgkb/pharmcat:2.15.2"
-        memory: max_memory
-        cpu: max_concurrent_processes
-    }
-
-    meta {
-        author: "PharmGKB"
-        email: "[email protected]"
-        description: "Workflow to run the PharmCAT pipeline on a VCF file"
-    }
+workflow pharmcat_pipeline {
+  meta {
+    author: "ClinPGx"
+    email: "[email protected]"
+    description: "This workflow runs a VCF file through the PharmCAT pipeline."
+  }
+
+  parameter_meta {
+    # description for this is intentionally different from pipeline script because it's hard to
+    # support a file of files on cloud services and directories aren't supported
+    vcf_file: "A VCF file (can be gzipped or bgzipped)."
+    sample_ids: "A comma-separated list of sample IDs.  Only applicable if you have multiple samples and only want to work on specific ones."
+    sample_file: "A file containing a list of sample IDs, one sample ID per line.  Only applicable if you have multiple samples and only want to work on specific ones."
+
+    missing_to_ref: "Assume genotypes at missing PGx sites are 0/0.  DANGEROUS!"
+    no_gvcf_check: "Bypass check if VCF file is in gVCF format."
+    # not including retain_specific_regions and reference_regions
+
+    run_matcher: "Run named allele matcher independently."
+    matcher_all_results: "Return all possible diplotypes, not just top hits."
+    matcher_save_html: "Save named allele matcher results as HTML."
+    research_mode: "Comma-separated list of research features to enable: [cyp2d6, combinations]"
+
+    run_phenotyper: "Run phenotyper independently."
+
+    run_reporter: "Run reporter independently."
+    reporter_sources: "Comma-separated list of sources to limit recommendations to: [CPIC, DPWG, FDA]"
+    reporter_extended: "Write an extended report (includes all possible genes and drugs, even if no data is available)"
+    reporter_save_json: "Save reporter results as JSON."
+
+    base_filename: "Prefix for output files.  Defaults to the same base name as the input."
+    delete_intermediate_files: "Delete intermediate PharmCAT files.  Defaults to saving all files."
+
+    max_concurrent_processes: "The maximum number of processes to use when concurrent mode is enabled."
+    max_memory: "The maximum memory PharmCAT should use (e.g. '64G')."
+  }
+
+
+  input {
+    File vcf_file
+    String sample_ids = ""
+    File? sample_file
+    Boolean missing_to_ref = false
+    Boolean no_gvcf_check = false
+    Boolean run_matcher = false
+    Boolean matcher_all_results = false
+    Boolean matcher_save_html = false
+    String research_mode = ""
+    Boolean run_phenotyper = false
+    Boolean run_reporter = false
+    String reporter_sources = ""
+    Boolean reporter_extended = false
+    Boolean reporter_save_json = false
+    String base_filename = ""
+    Boolean delete_intermediate_files = false
+    Int max_concurrent_processes = 1
+    String max_memory = "4G"
+  }
+
+  call pharmcat_pipeline_task {
+    input:
+      vcf_file = vcf_file,
+      sample_ids = sample_ids,
+      sample_file = sample_file,
+      missing_to_ref = missing_to_ref,
+      no_gvcf_check = no_gvcf_check,
+      run_matcher = run_matcher,
+      matcher_all_results = matcher_all_results,
+      matcher_save_html = matcher_save_html,
+      research_mode = research_mode,
+      run_phenotyper = run_phenotyper,
+      run_reporter = run_reporter,
+      reporter_sources = reporter_sources,
+      reporter_extended = reporter_extended,
+      reporter_save_json = reporter_save_json,
+      base_filename = base_filename,
+      delete_intermediate_files = delete_intermediate_files,
+      max_concurrent_processes = max_concurrent_processes,
+      max_memory = max_memory
+  }
+
+  output {
+    Array[File] results = pharmcat_pipeline_task.results
+  }
 }
 
-workflow pharmcat_pipeline {
-    input {
-        File vcf_file
-        String sample_ids = ""
-        File? sample_file
-        Boolean missing_to_ref = false
-        Boolean no_gvcf_check = false
-        Boolean retain_specific_regions = false
-        File? reference_regions
-        Boolean run_matcher = false
-        Boolean matcher_all_results = false
-        Boolean matcher_save_html = false
-        String research_mode = ""
-        Boolean run_phenotyper = false
-        Boolean run_reporter = false
-        String reporter_sources = ""
-        Boolean reporter_extended = false
-        Boolean reporter_save_json = false
-        String base_filename = ""
-        Boolean delete_intermediate_files = false
-        Int max_concurrent_processes = 1
-        String max_memory = "4G"
-    }
-
-    call pharmcat_pipeline {
-        input:
-            vcf_file = vcf_file,
-            sample_ids = sample_ids,
-            sample_file = sample_file,
-            missing_to_ref = missing_to_ref,
-            no_gvcf_check = no_gvcf_check,
-            retain_specific_regions = retain_specific_regions,
-            reference_regions = reference_regions,
-            run_matcher = run_matcher,
-            matcher_all_results = matcher_all_results,
-            matcher_save_html = matcher_save_html,
-            research_mode = research_mode,
-            run_phenotyper = run_phenotyper,
-            run_reporter = run_reporter,
-            reporter_sources = reporter_sources,
-            reporter_extended = reporter_extended,
-            reporter_save_json = reporter_save_json,
-            base_filename = base_filename,
-            delete_intermediate_files = delete_intermediate_files,
-            max_concurrent_processes = max_concurrent_processes,
-            max_memory = max_memory
-    }
-
-    output {
-        Array[File] results_all = pharmcat_pipeline.results
-    }
+
+task pharmcat_pipeline_task {
+  meta {
+    author: "ClinPGx"
+    email: "[email protected]"
+    description: "This task runs a VCF file through the PharmCAT pipeline."
+  }
+
+  input {
+    File vcf_file
+    String sample_ids = ""
+    File? sample_file
+    Boolean missing_to_ref = false
+    Boolean no_gvcf_check = false
+    Boolean run_matcher = false
+    Boolean matcher_all_results = false
+    Boolean matcher_save_html = false
+    String research_mode = ""
+    Boolean run_phenotyper = false
+    Boolean run_reporter = false
+    String reporter_sources = ""
+    Boolean reporter_extended = false
+    Boolean reporter_save_json = false
+    String base_filename = ""
+    Boolean delete_intermediate_files = false
+    Int max_concurrent_processes = 1
+    String max_memory = "4G"
+  }
+
+  command <<<
+    set -x -e -o pipefail
+    mkdir -p data
+    cp ~{vcf_file} data/
+
+    pharmcat_pipeline data/$(basename ~{vcf_file}) \
+    ~{if sample_ids != "" then '-s ' + sample_ids else ''} \
+    ~{if defined(sample_file) then '-S ' + sample_file else ''} \
+    ~{if missing_to_ref then '-0' else ''} \
+    ~{if no_gvcf_check then '-G' else ''} \
+    ~{if run_matcher then '-matcher' else ''} \
+    ~{if matcher_all_results then '-ma' else ''} \
+    ~{if matcher_save_html then '-matcherHtml' else ''} \
+    ~{if research_mode != "" then '-research ' + research_mode else ''} \
+    ~{if run_phenotyper then '-phenotyper' else ''} \
+    ~{if run_reporter then '-reporter' else ''} \
+    ~{if reporter_sources != "" then '-rs ' + reporter_sources else ''} \
+    ~{if reporter_extended then '-re' else ''} \
+    ~{if reporter_save_json then '-reporterJson' else ''} \
+    ~{if base_filename != "" then '-bf ' + base_filename else ''} \
+    ~{if delete_intermediate_files then '-del' else ''} \
+    -cp ~{max_concurrent_processes} -cm ~{max_memory}
+  >>>
+
+  output {
+    Array[File] results = glob("data/*")
+  }
+
+  runtime {
+    docker: "pgkb/pharmcat:2.13.0"
+    memory: max_memory
+    cpu: max_concurrent_processes
+  }
 }
 
diff --git a/dockstore/pipeline/README.md b/dockstore/pipeline/README.md
@@ -1,7 +1,7 @@
-# WDL to run PharmCAT_Pipeline
+# WDL to run the PharmCAT pipeline
 
 This WDL script executes the PharmCAT pipeline on a specified VCF file or a set of VCF files, processing genetic data to
-provide pharmacogenomic insights. The workflow automates the execution of the PharmCAT pipeline, streamlining the
+provide pharmacogenomic insights. This workflow automates the execution of the PharmCAT pipeline, streamlining the
 analysis of genetic variants to predict drug response and tailor medical treatment to individual patients' genetic
 profiles. By leveraging the Workflow Description Language (WDL), this script ensures reproducibility, scalability, and
 ease of use across various computational environments.
@@ -16,16 +16,18 @@ For details, see:
 
 ## Input Parameters
 
+The only required input is a VCF file.
+An example VCF file for testing is available [here](https://pharmcat.org/examples/pharmcat.example.vcf).
+
+
 ### Input Arguments
-- `File vcf_file`: Path to a VCF file or a file of paths to VCF files (one file per line), sorted by chromosome position.
-- `String sample_ids` (default: `""`): A comma-separated list of sample IDs.
-- `File? sample_file` (default: `null`): A file containing a list of samples, one sample per line.
+- `File vcf_file`: A VCF file (can be gzipped or bgzipped). Directories and files listing multiple VCF files are not supported by this workflow.
+- `String sample_ids` (default: `""`): A comma-separated list of sample IDs. Only applicable if you have multiple samples and only want to work on specific ones.
+- `File? sample_file` (default: `null`): A file containing a list of samples, one sample per line. Only applicable if you have multiple samples and only want to work on specific ones.
 
 ### Preprocessor Arguments
-- `Boolean missing_to_ref` (default: `false`): Assume genotypes at missing PGx sites are 0/0. DANGEROUS!.
-- `Boolean no_gvcf_check` (default: `false`): Bypass the gVCF check for the input VCF. DANGEROUS!.
-- `Boolean retain_specific_regions` (default: `false`): Retain the genomic regions specified by `-refRegion`.
-- `File? reference_regions` (default: `null`): A sorted bed file of specific PGx regions to retain. Must be used with the `-R` argument.
+- `Boolean missing_to_ref` (default: `false`): Assume genotypes at missing PGx sites are 0/0. DANGEROUS!
+- `Boolean no_gvcf_check` (default: `false`): Bypass the gVCF check for the input VCF.
 
 ### Named Allele Matcher Arguments
 - `Boolean run_matcher` (default: `false`): Run named allele matcher independently.
@@ -38,13 +40,13 @@ For details, see:
 
 ### Reporter Arguments
 - `Boolean run_reporter` (default: `false`): Run reporter independently.
-- `String reporter_sources` (default: `""`): Comma-separated list of sources to limit report to: [CPIC, DPWG].
-- `Boolean reporter_extended` (default: `false`): Output extended report.
+- `String reporter_sources` (default: `""`): Comma-separated list of sources to limit recommendations to: [CPIC, DPWG, FDA].
+- `Boolean reporter_extended` (default: `false`): Write an extended report (includes all possible genes and drugs, even if no data is available).
 - `Boolean reporter_save_json` (default: `false`): Save reporter results as JSON.
 
 ### Output Arguments
 - `String base_filename` (default: `""`): Prefix for output files. Defaults to the same base name as the input.
-- `Boolean delete_intermediate_files` (default: `false`): Delete intermediate PharmCAT files (saved by default).
+- `Boolean delete_intermediate_files` (default: `false`): Delete intermediate PharmCAT files. Defaults to saving all files.
 
 ### Concurrency/Memory Arguments
 - `Int max_concurrent_processes` (default: `1`): The maximum number of processes to use when concurrent mode is enabled.