Skip to content

Commit

Permalink
build(dockstore): tweak dockstore config, improve WDL docs
Browse files Browse the repository at this point in the history
  • Loading branch information
markwoon committed Aug 27, 2024
1 parent b46cae3 commit f66033b
Show file tree
Hide file tree
Showing 7 changed files with 169 additions and 1,250 deletions.
9 changes: 9 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Dockstore registration for the PharmCAT pipeline WDL workflow.
version: 1.2
workflows:
  - subclass: WDL
    name: PharmCAT-Pipeline
    # Paths are relative to the repository root.
    # NOTE(review): confirm these match where the descriptor and README actually
    # live in the repository (e.g. /pipeline/ vs. /dockstore/pipeline/).
    primaryDescriptorPath: /pipeline/PharmCAT_Pipeline.wdl
    readMePath: /pipeline/README.md
    filters:
      # Only tags matching this regular expression are picked up (e.g. a tag
      # that starts with "v" followed by at least one digit).
      tags:
        - /v\d+.*/
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ updateData: clean
mv -f src/main/resources/org/pharmgkb/pharmcat/definition/alleles/pharmcat_positions.* .
cp -f pharmcat_positions.vcf src/test/resources/org/pharmgkb/pharmcat/reference.vcf
cp -f pharmcat_positions.vcf docs/examples/pharmcat.example.vcf
cp -f pharmcat_positions.vcf dockstore/pipeline/example.vcf
sed -e '/rs12769205/ s/0\/0/1\/1/g' pharmcat_positions.vcf | sed '/rs4244285/ s/0\/0/1\/1/g' | sed '/rs3758581/ s/0\/0/1\/1/g' | sed '/rs3745274/ s/0\/0/0\/1/g' | sed '/rs2279343/ s/0\/0/0\/1/g' > docs/examples/pharmcat.example2.vcf
@echo ""
@echo "Updating examples..."
Expand Down
10 changes: 0 additions & 10 deletions dockstore.yml

This file was deleted.

267 changes: 146 additions & 121 deletions dockstore/pipeline/PharmCAT_Pipeline.wdl
Original file line number Diff line number Diff line change
@@ -1,127 +1,152 @@
version 1.0

# This file defines a single task that runs the PharmCAT pipeline on a VCF file.
# Because there is only one task, the workflow is just a wrapper around the task and gets the same name.
# The output is an array of files that are the results of the pipeline, saved on the platform where the workflow is running.
task pharmcat_pipeline {
input {
File vcf_file
String sample_ids = ""
File? sample_file
Boolean missing_to_ref = false
Boolean no_gvcf_check = false
Boolean retain_specific_regions = false
File? reference_regions
Boolean run_matcher = false
Boolean matcher_all_results = false
Boolean matcher_save_html = false
String research_mode = ""
Boolean run_phenotyper = false
Boolean run_reporter = false
String reporter_sources = ""
Boolean reporter_extended = false
Boolean reporter_save_json = false
String base_filename = ""
Boolean delete_intermediate_files = false
Int max_concurrent_processes = 1
String max_memory = "4G"
}

command <<<
set -x -e -o pipefail
mkdir -p data
cp ~{vcf_file} data/

pharmcat_pipeline data/$(basename ~{vcf_file}) \
~{if sample_ids != "" then '-s ' + sample_ids else ''} \
~{if defined(sample_file) then '-S ' + sample_file else ''} \
~{if missing_to_ref then '-0' else ''} \
~{if no_gvcf_check then '-G' else ''} \
~{if retain_specific_regions then '-R' else ''} \
~{if defined(reference_regions) then '-refRegion ' + reference_regions else ''} \
~{if run_matcher then '-matcher' else ''} \
~{if matcher_all_results then '-ma' else ''} \
~{if matcher_save_html then '-matcherHtml' else ''} \
~{if research_mode != "" then '-research ' + research_mode else ''} \
~{if run_phenotyper then '-phenotyper' else ''} \
~{if run_reporter then '-reporter' else ''} \
~{if reporter_sources != "" then '-rs ' + reporter_sources else ''} \
~{if reporter_extended then '-re' else ''} \
~{if reporter_save_json then '-reporterJson' else ''} \
~{if base_filename != "" then '-bf ' + base_filename else ''} \
~{if delete_intermediate_files then '-del' else ''} \
-cp ~{max_concurrent_processes} -cm ~{max_memory}
>>>

output {
Array[File] results = glob("data/*")
}

runtime {
docker: "pgkb/pharmcat:2.15.2"
memory: max_memory
cpu: max_concurrent_processes
}

meta {
author: "PharmGKB"
email: "[email protected]"
description: "Workflow to run the PharmCAT pipeline on a VCF file"
}
# Workflow entry point. It is a thin wrapper: every input is forwarded unchanged
# to `pharmcat_pipeline_task`, and that task's `results` array is re-exported as
# the workflow output.
workflow pharmcat_pipeline {
  meta {
    author: "ClinPGx"
    email: "[email protected]"
    description: "This workflow runs a VCF file through the PharmCAT pipeline."
  }

  parameter_meta {
    # description for this is intentionally different from pipeline script because it's hard to
    # support a file of files on cloud services and directories aren't supported
    vcf_file: "A VCF file (can be gzipped or bgzipped)."
    sample_ids: "A comma-separated list of sample IDs. Only applicable if you have multiple samples and only want to work on specific ones."
    sample_file: "A file containing a list of sample IDs, one sample ID per line. Only applicable if you have multiple samples and only want to work on specific ones."

    missing_to_ref: "Assume genotypes at missing PGx sites are 0/0. DANGEROUS!"
    no_gvcf_check: "Bypass check if VCF file is in gVCF format."
    # not including retain_specific_regions and reference_regions
    run_matcher: "Run named allele matcher independently."
    matcher_all_results: "Return all possible diplotypes, not just top hits."
    matcher_save_html: "Save named allele matcher results as HTML."
    research_mode: "Comma-separated list of research features to enable: [cyp2d6, combinations]"

    run_phenotyper: "Run phenotyper independently."

    run_reporter: "Run reporter independently."
    reporter_sources: "Comma-separated list of sources to limit recommendations to: [CPIC, DPWG, FDA]"
    reporter_extended: "Write an extended report (includes all possible genes and drugs, even if no data is available)"
    reporter_save_json: "Save reporter results as JSON."

    base_filename: "Prefix for output files. Defaults to the same base name as the input."
    delete_intermediate_files: "Delete intermediate PharmCAT files. Defaults to saving all files."

    max_concurrent_processes: "The maximum number of processes to use when concurrent mode is enabled."
    max_memory: "The maximum memory PharmCAT should use (e.g. '64G')."
  }


  # Only vcf_file is required; every other input has a default or is optional.
  input {
    File vcf_file
    String sample_ids = ""
    File? sample_file
    Boolean missing_to_ref = false
    Boolean no_gvcf_check = false
    Boolean run_matcher = false
    Boolean matcher_all_results = false
    Boolean matcher_save_html = false
    String research_mode = ""
    Boolean run_phenotyper = false
    Boolean run_reporter = false
    String reporter_sources = ""
    Boolean reporter_extended = false
    Boolean reporter_save_json = false
    String base_filename = ""
    Boolean delete_intermediate_files = false
    Int max_concurrent_processes = 1
    String max_memory = "4G"
  }

  # One-to-one pass-through of the workflow inputs to the task inputs.
  call pharmcat_pipeline_task {
    input:
      vcf_file = vcf_file,
      sample_ids = sample_ids,
      sample_file = sample_file,
      missing_to_ref = missing_to_ref,
      no_gvcf_check = no_gvcf_check,
      run_matcher = run_matcher,
      matcher_all_results = matcher_all_results,
      matcher_save_html = matcher_save_html,
      research_mode = research_mode,
      run_phenotyper = run_phenotyper,
      run_reporter = run_reporter,
      reporter_sources = reporter_sources,
      reporter_extended = reporter_extended,
      reporter_save_json = reporter_save_json,
      base_filename = base_filename,
      delete_intermediate_files = delete_intermediate_files,
      max_concurrent_processes = max_concurrent_processes,
      max_memory = max_memory
  }

  output {
    # The files collected by the task.
    Array[File] results = pharmcat_pipeline_task.results
  }
}

workflow pharmcat_pipeline {
input {
File vcf_file
String sample_ids = ""
File? sample_file
Boolean missing_to_ref = false
Boolean no_gvcf_check = false
Boolean retain_specific_regions = false
File? reference_regions
Boolean run_matcher = false
Boolean matcher_all_results = false
Boolean matcher_save_html = false
String research_mode = ""
Boolean run_phenotyper = false
Boolean run_reporter = false
String reporter_sources = ""
Boolean reporter_extended = false
Boolean reporter_save_json = false
String base_filename = ""
Boolean delete_intermediate_files = false
Int max_concurrent_processes = 1
String max_memory = "4G"
}

call pharmcat_pipeline {
input:
vcf_file = vcf_file,
sample_ids = sample_ids,
sample_file = sample_file,
missing_to_ref = missing_to_ref,
no_gvcf_check = no_gvcf_check,
retain_specific_regions = retain_specific_regions,
reference_regions = reference_regions,
run_matcher = run_matcher,
matcher_all_results = matcher_all_results,
matcher_save_html = matcher_save_html,
research_mode = research_mode,
run_phenotyper = run_phenotyper,
run_reporter = run_reporter,
reporter_sources = reporter_sources,
reporter_extended = reporter_extended,
reporter_save_json = reporter_save_json,
base_filename = base_filename,
delete_intermediate_files = delete_intermediate_files,
max_concurrent_processes = max_concurrent_processes,
max_memory = max_memory
}

output {
Array[File] results_all = pharmcat_pipeline.results
}

# Runs the `pharmcat_pipeline` command on a single VCF file inside the PharmCAT
# Docker image.
#
# The input VCF is copied into ./data and the tool is invoked on that copy.
# Each Boolean input adds its command-line flag only when true; each String
# input adds its flag only when non-empty; the optional sample_file adds `-S`
# only when it is defined. After the command finishes, every file matching
# data/* is returned as `results`.
#
# max_memory and max_concurrent_processes are used twice: they are passed to
# the tool (-cm / -cp) and also requested from the execution engine through
# the runtime section.
task pharmcat_pipeline_task {
  meta {
    author: "ClinPGx"
    email: "[email protected]"
    description: "This task runs a VCF file through the PharmCAT pipeline."
  }

  input {
    File vcf_file
    String sample_ids = ""
    File? sample_file
    Boolean missing_to_ref = false
    Boolean no_gvcf_check = false
    Boolean run_matcher = false
    Boolean matcher_all_results = false
    Boolean matcher_save_html = false
    String research_mode = ""
    Boolean run_phenotyper = false
    Boolean run_reporter = false
    String reporter_sources = ""
    Boolean reporter_extended = false
    Boolean reporter_save_json = false
    String base_filename = ""
    Boolean delete_intermediate_files = false
    Int max_concurrent_processes = 1
    String max_memory = "4G"
  }

  # File paths are quoted so that localized paths containing spaces are not
  # word-split by the shell.
  command <<<
    set -x -e -o pipefail
    mkdir -p data
    cp "~{vcf_file}" data/

    pharmcat_pipeline "data/$(basename "~{vcf_file}")" \
      ~{if sample_ids != "" then '-s ' + sample_ids else ''} \
      ~{if defined(sample_file) then "-S '" + sample_file + "'" else ''} \
      ~{if missing_to_ref then '-0' else ''} \
      ~{if no_gvcf_check then '-G' else ''} \
      ~{if run_matcher then '-matcher' else ''} \
      ~{if matcher_all_results then '-ma' else ''} \
      ~{if matcher_save_html then '-matcherHtml' else ''} \
      ~{if research_mode != "" then '-research ' + research_mode else ''} \
      ~{if run_phenotyper then '-phenotyper' else ''} \
      ~{if run_reporter then '-reporter' else ''} \
      ~{if reporter_sources != "" then '-rs ' + reporter_sources else ''} \
      ~{if reporter_extended then '-re' else ''} \
      ~{if reporter_save_json then '-reporterJson' else ''} \
      ~{if base_filename != "" then '-bf ' + base_filename else ''} \
      ~{if delete_intermediate_files then '-del' else ''} \
      -cp ~{max_concurrent_processes} -cm ~{max_memory}
  >>>

  output {
    # Everything matching data/* once the command has finished.
    Array[File] results = glob("data/*")
  }

  runtime {
    docker: "pgkb/pharmcat:2.13.0"
    memory: max_memory
    cpu: max_concurrent_processes
  }
}

26 changes: 14 additions & 12 deletions dockstore/pipeline/README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# WDL to run PharmCAT_Pipeline
# WDL to run the PharmCAT pipeline

This WDL script executes the PharmCAT pipeline on a specified VCF file or a set of VCF files, processing genetic data to
provide pharmacogenomic insights. The workflow automates the execution of the PharmCAT pipeline, streamlining the
provide pharmacogenomic insights. This workflow automates the execution of the PharmCAT pipeline, streamlining the
analysis of genetic variants to predict drug response and tailor medical treatment to individual patients' genetic
profiles. By leveraging the Workflow Description Language (WDL), this script ensures reproducibility, scalability, and
ease of use across various computational environments.
Expand All @@ -16,16 +16,18 @@ For details, see:

## Input Parameters

The only required input is a VCF file.
An example VCF file you can use to test with can be found [here](https://pharmcat.org/examples/pharmcat.example.vcf).


### Input Arguments
- `File vcf_file`: Path to a VCF file or a file of paths to VCF files (one file per line), sorted by chromosome position.
- `String sample_ids` (default: `""`): A comma-separated list of sample IDs.
- `File? sample_file` (default: `null`): A file containing a list of samples, one sample per line.
- `File vcf_file`: A VCF file (can be gzipped or bgzipped).
- `String sample_ids` (default: `""`): A comma-separated list of sample IDs. Only applicable if you have multiple samples and only want to work on specific ones.
- `File? sample_file` (default: `null`): A file containing a list of samples, one sample per line. Only applicable if you have multiple samples and only want to work on specific ones.

### Preprocessor Arguments
- `Boolean missing_to_ref` (default: `false`): Assume genotypes at missing PGx sites are 0/0. DANGEROUS!.
- `Boolean no_gvcf_check` (default: `false`): Bypass the gVCF check for the input VCF. DANGEROUS!.
- `Boolean retain_specific_regions` (default: `false`): Retain the genomic regions specified by `-refRegion`.
- `File? reference_regions` (default: `null`): A sorted bed file of specific PGx regions to retain. Must be used with the `-R` argument.
- `Boolean missing_to_ref` (default: `false`): Assume genotypes at missing PGx sites are 0/0. DANGEROUS!
- `Boolean no_gvcf_check` (default: `false`): Bypass the gVCF check for the input VCF.

### Named Allele Matcher Arguments
- `Boolean run_matcher` (default: `false`): Run named allele matcher independently.
Expand All @@ -38,13 +40,13 @@ For details, see:

### Reporter Arguments
- `Boolean run_reporter` (default: `false`): Run reporter independently.
- `String reporter_sources` (default: `""`): Comma-separated list of sources to limit report to: [CPIC, DPWG].
- `Boolean reporter_extended` (default: `false`): Output extended report.
- `String reporter_sources` (default: `""`): Comma-separated list of sources to limit recommendations to: [CPIC, DPWG, FDA].
- `Boolean reporter_extended` (default: `false`): Write an extended report (includes all possible genes and drugs, even if no data is available).
- `Boolean reporter_save_json` (default: `false`): Save reporter results as JSON.

### Output Arguments
- `String base_filename` (default: `""`): Prefix for output files. Defaults to the same base name as the input.
- `Boolean delete_intermediate_files` (default: `false`): Delete intermediate PharmCAT files (saved by default).
- `Boolean delete_intermediate_files` (default: `false`): Delete intermediate PharmCAT files. Defaults to saving all files.

### Concurrency/Memory Arguments
- `Int max_concurrent_processes` (default: `1`): The maximum number of processes to use when concurrent mode is enabled.
Expand Down
Loading

0 comments on commit f66033b

Please sign in to comment.